IAMTFRMZA committed on
Commit 5686026 · verified · 1 Parent(s): 3432a8c
Files changed (1)
  1. app.py +20 -25
app.py CHANGED
@@ -3,7 +3,6 @@ import shutil
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
-import pandas as pd
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
@@ -15,16 +14,14 @@ from langchain_community.document_loaders import UnstructuredExcelLoader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 
-# Set API key environment variable
+# Set API key
 os.environ["TOGETHER_API_KEY"] = os.getenv("TOGETHER_API_KEY")
 
-
 def inference(chain, input_query):
     """Invoke the processing chain with the input query."""
     result = chain.invoke(input_query)
     return result
 
-
 def create_chain(retriever, prompt, model):
     """Compose the processing chain with the specified components."""
     chain = (
@@ -35,7 +32,6 @@ def create_chain(retriever, prompt, model):
     )
     return chain
 
-
 def generate_prompt():
     """Define the prompt template for question answering."""
     template = """<s>[INST] Answer the question in a simple sentence based only on the following context:
@@ -44,7 +40,6 @@ def generate_prompt():
     """
     return ChatPromptTemplate.from_template(template)
 
-
 def configure_model():
     """Configure the language model with specified parameters."""
     return Together(
@@ -56,14 +51,12 @@ def configure_model():
         repetition_penalty=1.1,
     )
 
-
 def configure_retriever(documents):
     """Configure the retriever with embeddings and a FAISS vector store."""
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     vector_db = FAISS.from_documents(documents, embeddings)
     return vector_db.as_retriever()
 
-
 def load_pdf_documents(path):
     """Load and preprocess PDF documents from the specified path."""
     documents = []
@@ -74,7 +67,6 @@ def load_pdf_documents(path):
         documents.extend(loader.load())
     return documents
 
-
 def load_word_documents(path):
     """Load and preprocess Word documents from the specified path."""
     documents = []
@@ -85,7 +77,6 @@ def load_word_documents(path):
         documents.extend(loader.load())
     return documents
 
-
 def load_excel_documents(path):
     """Load and preprocess Excel documents from the specified path."""
     documents = []
@@ -96,7 +87,6 @@ def load_excel_documents(path):
         documents.extend(loader.load())
     return documents
 
-
 def load_documents(path):
     """Load and preprocess documents from PDF, Word, and Excel files."""
     pdf_docs = load_pdf_documents(path)
@@ -104,7 +94,6 @@ def load_documents(path):
     excel_docs = load_excel_documents(path)
     return pdf_docs + word_docs + excel_docs
 
-
 def scrape_url(url):
     """Scrape content from a given URL and save it to a text file."""
     try:
@@ -112,10 +101,6 @@ def scrape_url(url):
         response.raise_for_status()  # Ensure we notice bad responses
         soup = BeautifulSoup(response.content, 'html.parser')
         text = soup.get_text()
-
-        # Ensure the data directory exists
-        os.makedirs("data", exist_ok=True)
-
         # Save the text content to a file for processing
         text_file_path = "data/scraped_content.txt"
         with open(text_file_path, "w") as file:
@@ -124,16 +109,22 @@
     except requests.RequestException as e:
         st.error(f"Error fetching the URL: {e}")
         return None
-    except Exception as e:
-        st.error(f"An unexpected error occurred: {e}")
-        return None
-
 
 def process_document(path, input_query):
     """Process the document by setting up the chain and invoking it with the input query."""
     documents = load_documents(path)
+
+    if not documents:
+        st.error("No documents found. Please check the uploaded files or scraped content.")
+        return "No documents found."
+
     text_splitter = CharacterTextSplitter(chunk_size=18000, chunk_overlap=10)
     split_docs = text_splitter.split_documents(documents)
+
+    if not split_docs:
+        st.error("No text could be extracted from the documents.")
+        return "No text could be extracted."
+
     llm_model = configure_model()
     prompt = generate_prompt()
     retriever = configure_retriever(split_docs)
@@ -141,7 +132,6 @@ def process_document(path, input_query):
     response = inference(chain, input_query)
     return response
 
-
 def main():
     """Main function to run the Streamlit app."""
     tmp_folder = '/tmp/1'
@@ -164,7 +154,8 @@ def main():
         user_query = st.text_input("Ask a question:", key="query_input")
         if st.form_submit_button("Ask") and user_query:
             response = process_document(tmp_folder, user_query)
-            st.session_state.chat_history.append({"question": user_query, "answer": response})
+            if response:  # Check if response is not empty
+                st.session_state.chat_history.append({"question": user_query, "answer": response})
 
     if st.button("Clear Chat History"):
         st.session_state.chat_history = []
@@ -182,9 +173,13 @@ def main():
             file_path = scrape_url(url_input)
             if file_path:
                 documents = load_documents(tmp_folder)
-                response = process_document(tmp_folder, "What is the content of the URL?")
-                st.session_state.chat_history.append({"question": "What is the content of the URL?", "answer": response})
-                st.success("URL content processed successfully!")
+                if documents:  # Check if documents are loaded after scraping
+                    response = process_document(tmp_folder, "What is the content of the URL?")
+                    if response:  # Check if response is not empty
+                        st.session_state.chat_history.append({"question": "What is the content of the URL?", "answer": response})
+                    st.success("URL content processed successfully!")
+                else:
+                    st.error("Failed to load any documents from the scraped URL content.")
             else:
                 st.error("Failed to process URL content.")
         else: