app.py CHANGED
@@ -3,7 +3,6 @@ import shutil
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
-import pandas as pd
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
@@ -15,16 +14,14 @@ from langchain_community.document_loaders import UnstructuredExcelLoader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings

-# Set API key
+# Set API key
 os.environ["TOGETHER_API_KEY"] = os.getenv("TOGETHER_API_KEY")

-
 def inference(chain, input_query):
     """Invoke the processing chain with the input query."""
     result = chain.invoke(input_query)
     return result

-
 def create_chain(retriever, prompt, model):
     """Compose the processing chain with the specified components."""
     chain = (
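The body of `chain = (...)` falls outside this hunk. A minimal sketch of how such a chain is commonly composed from the imported LCEL pieces; the exact layout is an assumption, not shown in the diff:

```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def create_chain(retriever, prompt, model):
    # Assumed layout: retrieved context plus the raw question feed the prompt,
    # the model generates, and the parser returns a plain string.
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )
    return chain
```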
@@ -35,7 +32,6 @@ def create_chain(retriever, prompt, model):
     )
     return chain

-
 def generate_prompt():
     """Define the prompt template for question answering."""
     template = """<s>[INST] Answer the question in a simple sentence based only on the following context:
@@ -44,7 +40,6 @@ def generate_prompt():
     """
     return ChatPromptTemplate.from_template(template)

-
 def configure_model():
     """Configure the language model with specified parameters."""
     return Together(
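The middle of the prompt template is cut off between hunks. A plausible completion, assuming the usual `{context}` and `{question}` placeholders that the chain sketch above expects:

```python
from langchain_core.prompts import ChatPromptTemplate

# Assumed template body; only the first line and the closing quotes appear in the diff.
template = """<s>[INST] Answer the question in a simple sentence based only on the following context:
{context}
Question: {question} [/INST]
"""
prompt = ChatPromptTemplate.from_template(template)
```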
@@ -56,14 +51,12 @@ def configure_model():
         repetition_penalty=1.1,
     )

-
 def configure_retriever(documents):
     """Configure the retriever with embeddings and a FAISS vector store."""
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     vector_db = FAISS.from_documents(documents, embeddings)
     return vector_db.as_retriever()

-
 def load_pdf_documents(path):
     """Load and preprocess PDF documents from the specified path."""
     documents = []
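Only `repetition_penalty=1.1` and the closing parenthesis of the `Together(...)` call are visible in the diff. A hedged sketch of what `configure_model` might look like in full; the model name and the remaining parameters are assumptions:

```python
from langchain_community.llms import Together

def configure_model():
    """Configure the language model with specified parameters."""
    return Together(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # assumed model name
        temperature=0.7,                               # assumed value
        max_tokens=512,                                # assumed value
        repetition_penalty=1.1,                        # shown in the diff
    )
```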
@@ -74,7 +67,6 @@ def load_pdf_documents(path):
         documents.extend(loader.load())
     return documents

-
 def load_word_documents(path):
     """Load and preprocess Word documents from the specified path."""
     documents = []
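The loop that feeds `documents.extend(loader.load())` lies outside the hunk. A sketch of a typical loader loop, assuming `PyPDFLoader`; the actual loader class and file-discovery logic are not shown in the diff:

```python
import os
from langchain_community.document_loaders import PyPDFLoader

def load_pdf_documents(path):
    """Load and preprocess PDF documents from the specified path."""
    documents = []
    for file_name in os.listdir(path):
        if file_name.lower().endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(path, file_name))  # assumed loader choice
            documents.extend(loader.load())
    return documents
```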
@@ -85,7 +77,6 @@ def load_word_documents(path):
         documents.extend(loader.load())
     return documents

-
 def load_excel_documents(path):
     """Load and preprocess Excel documents from the specified path."""
     documents = []
@@ -96,7 +87,6 @@ def load_excel_documents(path):
         documents.extend(loader.load())
     return documents

-
 def load_documents(path):
     """Load and preprocess documents from PDF, Word, and Excel files."""
     pdf_docs = load_pdf_documents(path)
@@ -104,7 +94,6 @@ def load_documents(path):
     excel_docs = load_excel_documents(path)
     return pdf_docs + word_docs + excel_docs

-
 def scrape_url(url):
     """Scrape content from a given URL and save it to a text file."""
     try:
@@ -112,10 +101,6 @@ def scrape_url(url):
         response.raise_for_status()  # Ensure we notice bad responses
         soup = BeautifulSoup(response.content, 'html.parser')
         text = soup.get_text()
-
-        # Ensure the data directory exists
-        os.makedirs("data", exist_ok=True)
-
         # Save the text content to a file for processing
         text_file_path = "data/scraped_content.txt"
         with open(text_file_path, "w") as file:
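With the `os.makedirs("data", exist_ok=True)` line removed, the write that follows relies on the `data/` directory already existing; if nothing else creates it, `open(..., "w")` raises `FileNotFoundError`. A minimal sketch of a guard for the same path, under that assumption:

```python
import os

text = "example scraped page text"  # stands in for soup.get_text()
text_file_path = "data/scraped_content.txt"
os.makedirs(os.path.dirname(text_file_path), exist_ok=True)  # ensure data/ exists before writing
with open(text_file_path, "w") as file:
    file.write(text)
```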
@@ -124,16 +109,22 @@
     except requests.RequestException as e:
         st.error(f"Error fetching the URL: {e}")
         return None
-    except Exception as e:
-        st.error(f"An unexpected error occurred: {e}")
-        return None
-

 def process_document(path, input_query):
     """Process the document by setting up the chain and invoking it with the input query."""
     documents = load_documents(path)
+
+    if not documents:
+        st.error("No documents found. Please check the uploaded files or scraped content.")
+        return "No documents found."
+
     text_splitter = CharacterTextSplitter(chunk_size=18000, chunk_overlap=10)
     split_docs = text_splitter.split_documents(documents)
+
+    if not split_docs:
+        st.error("No text could be extracted from the documents.")
+        return "No text could be extracted."
+
     llm_model = configure_model()
     prompt = generate_prompt()
     retriever = configure_retriever(split_docs)
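For reference, a hypothetical end-to-end use of the helpers that `process_document` wires together, assuming `split_docs` was produced by the splitter above; the function names and signatures come from the diff:

```python
# split_docs, configure_retriever, create_chain, generate_prompt, configure_model,
# and inference are the app.py helpers shown in this diff.
retriever = configure_retriever(split_docs)
chain = create_chain(retriever, generate_prompt(), configure_model())
answer = inference(chain, "What is this document about?")
print(answer)
```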
@@ -141,7 +132,6 @@ def process_document(path, input_query):
     response = inference(chain, input_query)
     return response

-
 def main():
     """Main function to run the Streamlit app."""
     tmp_folder = '/tmp/1'
@@ -164,7 +154,8 @@
         user_query = st.text_input("Ask a question:", key="query_input")
         if st.form_submit_button("Ask") and user_query:
             response = process_document(tmp_folder, user_query)
-
+            if response:  # Check if response is not empty
+                st.session_state.chat_history.append({"question": user_query, "answer": response})

     if st.button("Clear Chat History"):
         st.session_state.chat_history = []
@@ -182,9 +173,13 @@
             file_path = scrape_url(url_input)
             if file_path:
                 documents = load_documents(tmp_folder)
-
-
-
+                if documents:  # Check if documents are loaded after scraping
+                    response = process_document(tmp_folder, "What is the content of the URL?")
+                    if response:  # Check if response is not empty
+                        st.session_state.chat_history.append({"question": "What is the content of the URL?", "answer": response})
+                    st.success("URL content processed successfully!")
+                else:
+                    st.error("Failed to load any documents from the scraped URL content.")
             else:
                 st.error("Failed to process URL content.")
         else:
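The diff stores Q/A pairs in `st.session_state.chat_history` but does not show how they are displayed. One way the history could be rendered elsewhere in `main()`; this is not part of the change, only a sketch assuming entries shaped like `{"question": ..., "answer": ...}`:

```python
import streamlit as st

for entry in st.session_state.chat_history:
    st.markdown(f"**Q:** {entry['question']}")
    st.markdown(f"**A:** {entry['answer']}")
```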