Spaces:
Sleeping
Sleeping
JayWadekar
committed on
Commit
·
6855cb4
1
Parent(s):
b3882e1
First commit
Browse files- README.md +8 -7
- app.py +59 -51
- cookies.txt +4 -0
- gitattributes +35 -0
- ims/gwIASlogo.jpg +0 -0
- ims/userpic.png +0 -0
- rag.py +57 -0
- requirements.txt +10 -1
- urls.txt +19 -0
README.md
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
|
|
|
11 |
---
|
12 |
|
13 |
-
|
|
|
1 |
---
|
2 |
+
title: CAMELSDocBot
|
3 |
+
emoji: 🐨
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.5.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
license: mit
|
11 |
+
short_description: Chatbot assistant for the CAMELS simulations documentation
|
12 |
---
|
13 |
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,64 +1,72 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
|
|
|
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
history: list[tuple[str, str]],
|
13 |
-
system_message,
|
14 |
-
max_tokens,
|
15 |
-
temperature,
|
16 |
-
top_p,
|
17 |
-
):
|
18 |
-
messages = [{"role": "system", "content": system_message}]
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
messages.append({"role": "assistant", "content": val[1]})
|
25 |
|
26 |
-
|
|
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
for message in client.chat_completion(
|
31 |
-
messages,
|
32 |
-
max_tokens=max_tokens,
|
33 |
-
stream=True,
|
34 |
-
temperature=temperature,
|
35 |
-
top_p=top_p,
|
36 |
-
):
|
37 |
-
token = message.choices[0].delta.content
|
38 |
|
39 |
-
|
40 |
-
yield response
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
""
|
46 |
-
|
47 |
-
|
48 |
-
additional_inputs=[
|
49 |
-
gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
|
50 |
-
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
|
51 |
-
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
|
52 |
-
gr.Slider(
|
53 |
-
minimum=0.1,
|
54 |
-
maximum=1.0,
|
55 |
-
value=0.95,
|
56 |
-
step=0.05,
|
57 |
-
label="Top-p (nucleus sampling)",
|
58 |
-
),
|
59 |
-
],
|
60 |
-
)
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
-
if __name__ == "__main__":
|
64 |
demo.launch()
|
|
|
|
|
|
|
|
|
|
1 |
+
# AI assistant with a RAG system to query information from
|
2 |
+
# the gwIAS search pipeline
|
3 |
+
# using Langchain and deployed with Gradio
|
4 |
+
|
5 |
+
# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
|
6 |
+
# https://huggingface.co/spaces/PabloVD/CAMELSDocBot
|
7 |
|
8 |
+
from rag import RAG, load_docs
|
9 |
+
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
10 |
+
from langchain.chat_models import ChatOpenAI
|
11 |
+
import gradio as gr
|
12 |
|
13 |
+
# Load the documentation
|
14 |
+
docs = load_docs()
print("Pages loaded:", len(docs))

# Chat model that generates the answers
llm = ChatOpenAI(model="gpt-4o-mini")

# Embedding model handed to the RAG builder
# (alternative: "nvidia/NV-Embed-v2")
embed_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model)

# Assemble the retrieval-augmented generation chain
rag_chain = RAG(llm=llm, docs=docs, embeddings=embeddings)
|
27 |
|
28 |
+
# Function to handle prompt and query the RAG chain
|
29 |
+
def handle_prompt(message, history):
    """Stream the RAG chain's answer to *message* for the chat interface.

    Args:
        message: The user's prompt; passed unchanged to ``rag_chain.stream``.
        history: Previous conversation turns supplied by the chat interface.
            It is not used: only the current message is sent to the chain.

    Yields:
        The answer accumulated so far, growing by one chunk per iteration.

    Raises:
        gr.Error: If anything fails while streaming. The original exception
            is attached as the cause.
    """
    try:
        out = ""
        for chunk in rag_chain.stream(message):
            out += chunk
            # Yield the full text so far, not just the newest chunk.
            yield out
    except Exception as e:
        # Chain explicitly so the underlying traceback is not lost.
        raise gr.Error(f"An error occurred: {e}") from e
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
+
if __name__ == "__main__":

    # Texts shown in the UI and the suggested starter questions
    description = "AI powered assistant to help with [gwIAS](https://github.com/JayWadekar/gwIAS-HM) gravitational wave search pipeline."
    greetingsmessage = "Hi, I'm the gwIAS Bot, I'm here to assist you with the search pipeline."
    example_questions = [
        "Can you give me the code for calculating coherent score?",
        "Which module in the code is used for collecting coincident triggers?",
        "How are template banks constructed?",
    ]

    # Chat window, pre-populated with the assistant's greeting
    chatbot = gr.Chatbot(
        [{"role": "assistant", "content": greetingsmessage}],
        type="messages",
        avatar_images=["ims/userpic.png", "ims/gwIASlogo.jpg"],
        height="60vh",
    )

    # Chat interface wired to the streaming prompt handler
    # (cache_examples is not passed, so it keeps its default)
    demo = gr.ChatInterface(
        handle_prompt,
        type="messages",
        title="gwIAS DocBot",
        fill_height=True,
        examples=example_questions,
        theme=gr.themes.Soft(),
        description=description,
        chatbot=chatbot,
    )

    demo.launch()
|
69 |
+
|
70 |
+
# https://arxiv.org/html/2405.17400v2
|
71 |
+
# https://arxiv.org/html/2312.06631v1
|
72 |
+
# https://arxiv.org/html/2310.15233v2
|
cookies.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Netscape HTTP Cookie File
|
2 |
+
# https://curl.haxx.se/docs/http-cookies.html
|
3 |
+
# This file was generated by libcurl! Edit at your own risk.
|
4 |
+
|
gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
ims/gwIASlogo.jpg
ADDED
![]() |
ims/userpic.png
ADDED
![]() |
rag.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Utilities to build a RAG system to query information from the
|
2 |
+
# gwIAS search pipeline using Langchain
|
3 |
+
|
4 |
+
# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
|
5 |
+
# https://huggingface.co/spaces/PabloVD/CAMELSDocBot
|
6 |
+
|
7 |
+
from langchain import hub
|
8 |
+
from langchain_chroma import Chroma
|
9 |
+
from langchain_core.output_parsers import StrOutputParser
|
10 |
+
from langchain_core.runnables import RunnablePassthrough
|
11 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
12 |
+
from langchain_community.document_loaders import WebBaseLoader
|
13 |
+
|
14 |
+
# Load documentation from urls
|
15 |
+
def load_docs(urls_path="urls.txt"):
    """Load every page listed in the URL file and return the documents.

    Args:
        urls_path: Text file containing one URL per line. Defaults to
            ``urls.txt`` so existing ``load_docs()`` calls behave as before.

    Returns:
        The result of ``WebBaseLoader(urls).load()`` for the listed URLs.
    """
    # The context manager closes the file even if reading raises.
    with open(urls_path, encoding="utf-8") as urls_file:
        # Strip surrounding whitespace and skip blank lines so a trailing
        # newline or empty line never becomes an empty URL for the loader.
        urls = [line.strip() for line in urls_file if line.strip()]

    # Fetch the contents of the listed pages.
    loader = WebBaseLoader(urls)
    return loader.load()
|
28 |
+
|
29 |
+
# Join content pages for processing
|
30 |
+
def format_docs(docs):
    """Concatenate each document's ``page_content``, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)
|
32 |
+
|
33 |
+
# Create a RAG chain
|
34 |
+
def RAG(llm, docs, embeddings):
    """Build a retrieval-augmented generation chain over *docs*.

    Args:
        llm: Chat model placed after the prompt in the chain.
        docs: Documents to split, embed and index.
        embeddings: Embedding model passed to the Chroma vector store.

    Returns:
        A chain that takes a question, retrieves matching chunks, fills the
        ``rlm/rag-prompt`` prompt, calls *llm*, and parses the output with
        ``StrOutputParser``.
    """
    # Cut the documents into overlapping chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    # Index the chunks and expose the index as a retriever
    store = Chroma.from_documents(documents=chunks, embedding=embeddings)
    retriever = store.as_retriever()

    # Prompt pulled from the LangChain hub
    prompt = hub.pull("rlm/rag-prompt")

    # The question is passed through unchanged; the context is the
    # retrieved chunks joined into one string.
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }

    return chain_inputs | prompt | llm | StrOutputParser()
|
requirements.txt
CHANGED
@@ -1 +1,10 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
langchain-community
|
3 |
+
langchain-chroma
|
4 |
+
langchain-mistralai
|
5 |
+
beautifulsoup4
|
6 |
+
pypdf==5.0.1
|
7 |
+
sentence-transformers==2.2.2
|
8 |
+
huggingface_hub==0.25.2
|
9 |
+
InstructorEmbedding
|
10 |
+
openai
|
urls.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
https://github.com/JayWadekar/gwIAS-HM/tree/main
|
2 |
+
https://github.com/JayWadekar/gwIAS-HM/tree/main/Pipeline
|
3 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/README.md
|
4 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/ML_modules.py
|
5 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coherent_score_hm_search.py
|
6 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coherent_score_mz_fast.py
|
7 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coincidence_HM.py
|
8 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/data_operations.py
|
9 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/download_data.py
|
10 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/gw_detect_file.py
|
11 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/params.py
|
12 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/python_utils.py
|
13 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/ranking_HM.py
|
14 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/readligo.py
|
15 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_generator_HM.py
|
16 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_params_O3a_HM.py
|
17 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggering_on_cluster.py
|
18 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggers_single_detector_HM.py
|
19 |
+
https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py
|