JayWadekar committed 6855cb4 (parent: b3882e1)

First commit

Files changed (9):
  1. README.md +8 -7
  2. app.py +59 -51
  3. cookies.txt +4 -0
  4. gitattributes +35 -0
  5. ims/gwIASlogo.jpg +0 -0
  6. ims/userpic.png +0 -0
  7. rag.py +57 -0
  8. requirements.txt +10 -1
  9. urls.txt +19 -0
README.md CHANGED
@@ -1,13 +1,14 @@
 ---
-title: GwIAS
-emoji: 💬
-colorFrom: yellow
-colorTo: purple
+title: CAMELSDocBot
+emoji: 🐨
+colorFrom: pink
+colorTo: blue
 sdk: gradio
-sdk_version: 5.0.1
+sdk_version: 5.5.0
 app_file: app.py
 pinned: false
-short_description: LLM for the IAS-HM gravitational wave search pipeline
+license: mit
+short_description: Chatbot assistant for the CAMELS simulations documentation
 ---
 
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,64 +1,72 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
+# AI assistant with a RAG system to query information from
+# the gwIAS search pipeline,
+# using LangChain and deployed with Gradio
+
+# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
+# https://huggingface.co/spaces/PabloVD/CAMELSDocBot
+
+from rag import RAG, load_docs
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+from langchain.chat_models import ChatOpenAI
+import gradio as gr
+
+# Load the documentation
+docs = load_docs()
+print("Pages loaded:", len(docs))
+
+# LLM model
+llm = ChatOpenAI(model="gpt-4o-mini")
+
+# Embeddings
+embed_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
+# embed_model = "nvidia/NV-Embed-v2"
+embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model)
+
+# RAG chain
+rag_chain = RAG(llm, docs, embeddings)
+
+# Handle a user prompt by streaming the RAG chain's answer
+def handle_prompt(message, history):
+    try:
+        # Stream output
+        out = ""
+        for chunk in rag_chain.stream(message):
+            out += chunk
+            yield out
+    except Exception as e:
+        raise gr.Error(f"An error occurred: {str(e)}")
+
+
+if __name__ == "__main__":
+
+    # Predefined messages and examples
+    description = "AI-powered assistant to help with the [gwIAS](https://github.com/JayWadekar/gwIAS-HM) gravitational wave search pipeline."
+    greetingsmessage = "Hi, I'm the gwIAS Bot. I'm here to assist you with the search pipeline."
+    example_questions = [
+        "Can you give me the code for calculating the coherent score?",
+        "Which module in the code is used for collecting coincident triggers?",
+        "How are template banks constructed?"
+    ]
+
+    # Define customized Gradio chatbot
+    chatbot = gr.Chatbot([{"role": "assistant", "content": greetingsmessage}],
+                         type="messages",
+                         avatar_images=["ims/userpic.png", "ims/gwIASlogo.jpg"],
+                         height="60vh")
+
+    # Define Gradio interface
+    demo = gr.ChatInterface(handle_prompt,
+                            type="messages",
+                            title="gwIAS DocBot",
+                            fill_height=True,
+                            examples=example_questions,
+                            theme=gr.themes.Soft(),
+                            description=description,
+                            # cache_examples=False,
+                            chatbot=chatbot)
+
     demo.launch()
+
+# https://arxiv.org/html/2405.17400v2
+# https://arxiv.org/html/2312.06631v1
+# https://arxiv.org/html/2310.15233v2
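Note: app.py needs an OpenAI key for ChatOpenAI, and this commit does not set one anywhere. A minimal local-run sketch, assuming the standard OPENAI_API_KEY environment variable; the launcher file and key value below are hypothetical, not part of the commit:

    # launch_local.py (hypothetical helper, not part of this commit)
    import os
    import runpy

    os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder; on a Space, set this as a secret instead
    runpy.run_path("app.py", run_name="__main__")  # execute app.py as if invoked directly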
cookies.txt ADDED
@@ -0,0 +1,4 @@
+# Netscape HTTP Cookie File
+# https://curl.haxx.se/docs/http-cookies.html
+# This file was generated by libcurl! Edit at your own risk.
+
gitattributes ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
ims/gwIASlogo.jpg ADDED
ims/userpic.png ADDED
rag.py ADDED
@@ -0,0 +1,57 @@
+# Utilities to build a RAG system to query information from the
+# gwIAS search pipeline using LangChain
+
+# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
+# https://huggingface.co/spaces/PabloVD/CAMELSDocBot
+
+from langchain import hub
+from langchain_chroma import Chroma
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import WebBaseLoader
+
+# Load documentation from urls
+def load_docs():
+
+    # Get urls, stripping trailing newlines
+    urlsfile = open("urls.txt")
+    urls = urlsfile.readlines()
+    urls = [url.replace("\n", "") for url in urls]
+    urlsfile.close()
+
+    # Load the contents of the pages (chunking happens later, in RAG)
+    loader = WebBaseLoader(urls)
+    docs = loader.load()
+
+    return docs
+
+# Join content pages for processing
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+
+# Create a RAG chain
+def RAG(llm, docs, embeddings):
+
+    # Split text into overlapping chunks
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    splits = text_splitter.split_documents(docs)
+
+    # Create vector store
+    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
+
+    # Retrieve and generate using the relevant snippets of the documents
+    retriever = vectorstore.as_retriever()
+
+    # Prompt basis example for RAG systems
+    prompt = hub.pull("rlm/rag-prompt")
+
+    # Create the chain
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    return rag_chain
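For reference, a minimal sketch of querying the chain returned by RAG() outside the Gradio app, assuming the same components app.py wires together and the standard LCEL invoke()/stream() interface (the question string is just illustrative):

    from rag import RAG, load_docs
    from langchain_community.embeddings import HuggingFaceInstructEmbeddings
    from langchain.chat_models import ChatOpenAI

    # Build the chain exactly as app.py does
    docs = load_docs()
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="sentence-transformers/multi-qa-distilbert-cos-v1")
    llm = ChatOpenAI(model="gpt-4o-mini")
    chain = RAG(llm, docs, embeddings)

    # One-shot answer
    print(chain.invoke("How are template banks constructed?"))

    # Or stream token-by-token, as handle_prompt() does in app.py
    for chunk in chain.stream("How are template banks constructed?"):
        print(chunk, end="", flush=True)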
requirements.txt CHANGED
@@ -1 +1,10 @@
-huggingface_hub==0.25.2
+langchain
+langchain-community
+langchain-chroma
+langchain-mistralai
+beautifulsoup4
+pypdf==5.0.1
+sentence-transformers==2.2.2
+huggingface_hub==0.25.2
+InstructorEmbedding
+openai
urls.txt ADDED
@@ -0,0 +1,19 @@
+https://github.com/JayWadekar/gwIAS-HM/tree/main
+https://github.com/JayWadekar/gwIAS-HM/tree/main/Pipeline
+https://github.com/JayWadekar/gwIAS-HM/blob/main/README.md
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/ML_modules.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coherent_score_hm_search.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coherent_score_mz_fast.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coincidence_HM.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/data_operations.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/download_data.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/gw_detect_file.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/params.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/python_utils.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/ranking_HM.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/readligo.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_generator_HM.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_params_O3a_HM.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggering_on_cluster.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggers_single_detector_HM.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py