ali121300 committed on
Commit
9a66b4f
1 Parent(s): a66bea3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -176
app.py CHANGED
@@ -1,202 +1,66 @@
1
- import os
2
- import streamlit as st
3
- from dotenv import load_dotenv
4
- from PyPDF2 import PdfReader
5
- from langchain.text_splitter import CharacterTextSplitter
6
- from langchain.embeddings import HuggingFaceBgeEmbeddings
7
  from langchain.vectorstores import FAISS
8
- from langchain.chat_models import ChatOpenAI
9
- from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
- from htmlTemplates import css, bot_template, user_template
12
- from langchain.llms import HuggingFaceHub
13
- from deep_translator import GoogleTranslator
14
- import pandas as pd
15
- from langchain_groq import ChatGroq
16
- from openai import OpenAI
17
  from langchain.chat_models import ChatOpenAI
18
- # set this key as an environment variable
19
- os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets['Key2']
20
- os.environ["OPENAI_API_KEY"] =st.secrets['Key3']
21
- from langchain.llms import LlamaCpp
22
- from langchain import PromptTemplate, LLMChain
23
- from langchain.callbacks.manager import CallbackManager
24
- from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
25
-
26
  ###########################################################################################
27
 
28
- def get_pdf_text(pdf_docs : list) -> str:
29
- text = ""
30
- for pdf in pdf_docs:
31
- pdf_reader = PdfReader(pdf)
32
- for page in pdf_reader.pages:
33
- text += page.extract_text()
34
- return text
35
- #######################################################################################
36
- def load_file():
37
- loader = TextLoader('d2.txt')
38
- documents = loader.load()
39
- return documents
40
  ########################################################################################
41
- def get_text_chunks(text:str) ->list:
42
- text_splitter = CharacterTextSplitter(
43
- separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len
44
- )
45
- chunks = text_splitter.split_text(text)
46
- return chunks
47
-
48
-
49
- def get_vectorstore(text_chunks : list) -> FAISS:
50
- #model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
51
- model="Avditvs/multilingual-e5-small-distill-base-0.1"
52
- encode_kwargs = {
53
- "normalize_embeddings": True
54
- } # set True to compute cosine similarity
55
- embeddings = HuggingFaceBgeEmbeddings(
56
- model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
57
- )
58
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
59
- return vectorstore
60
-
61
-
62
- def get_conversation_chain(vectorstore:FAISS) -> ConversationalRetrievalChain:
63
- # llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
64
- llm = HuggingFaceHub(
65
- #repo_id="mistralai/Mistral-7B-Instruct-v0.2",
66
- repo_id="google/gemma-1.1-7b-it",
67
- #repo_id="TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
68
- model_kwargs={"temperature": 0.5, "max_length": 2048},
69
- )
70
-
71
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
72
- conversation_chain = ConversationalRetrievalChain.from_llm(
73
- llm=llm, retriever=vectorstore.as_retriever(), memory=memory
74
- )
75
- return conversation_chain
76
-
77
-
78
-
79
- def handle_userinput(user_question:str):
80
- response = st.session_state.conversation({"question": user_question})
81
- st.session_state.chat_history = response["chat_history"]
82
-
83
- for i, message in enumerate(st.session_state.chat_history):
84
- if i % 2 == 0:
85
- text2=message.content
86
- translator = GoogleTranslator(source='english', target='persian')
87
- result = translator.translate(text2)
88
- st.write("سوال کاربر: "+result)
89
- else:
90
- text1=message.content
91
- translator = GoogleTranslator(source='english', target='persian')
92
- result = translator.translate(text1)
93
- st.write("پاسخ ربات: "+result)
94
-
95
- #############################################################################################################
96
- def read_pdf_pr_en(pdf_file_path):
97
- from deep_translator import GoogleTranslator
98
- import PyPDF2
99
- # مسیر فایل PDF را تعیین کنید
100
- #pdf_file_path = '/content/d2en.pdf'
101
- # باز کردن فایل PDF
102
- with open(pdf_file_path, 'rb') as pdf_file:
103
- pdf_reader = PyPDF2.PdfReader(pdf_file)
104
- # خواندن محتوای صفحه‌ها
105
- full_text = ''
106
- for page in pdf_reader.pages:
107
- page_pdf=page.extract_text()
108
- translator = GoogleTranslator(source='persian', target='english')
109
- result = translator.translate(page_pdf)
110
- full_text +=result
111
- st.write(full_text)
112
- return(full_text)
113
- #################################################################################################################
114
- def get_pdf_text(pdf_docs):
115
- text = ""
116
- for pdf in pdf_docs:
117
- pdf_reader = PdfReader(pdf)
118
- for page in pdf_reader.pages:
119
- txt_page=page.extract_text()
120
- text += txt_page
121
- return text
122
- #######################################################################################################################
123
- def upload_xls():
124
- st.title("آپلود و نمایش فایل اکسل")
125
- uploaded_file = st.file_uploader("لطفاً فایل اکسل خود را آپلود کنید", type=["xlsx", "xls"])
126
- if uploaded_file is not None:
127
- df = pd.read_excel(uploaded_file)
128
- st.write("دیتا فریم مربوط به فایل اکسل:")
129
- st.write(df)
130
- return df
131
-
132
- ################################################################################################################
133
- def sentences_f(sentence,df2):
134
- words = sentence.split()
135
- df1 = pd.DataFrame(words, columns=['کلمات'])
136
- df1['معادل'] = ''
137
- for i, word in df1['کلمات'].items():
138
- match = df2[df2['کلمات'] == word]
139
- if not match.empty:
140
- df1.at[i, 'معادل'] = match['معادل'].values[0]
141
- df1['معادل'] = df1.apply(lambda row: row['کلمات'] if row['معادل'] == '' else row['معادل'], axis=1)
142
- translated_sentence = ' '.join(df1['معادل'].tolist())
143
- return translated_sentence
144
- ####################################################################################################################
145
-
146
  ####################################################################################################################
147
  def main():
148
  st.set_page_config(
149
  page_title="Chat Bot PDFs",
150
  page_icon=":books:",
151
  )
152
-
153
- #st.markdown("# Chat with a Bot")
154
- #st.markdown("This bot tries to answer questions about multiple PDFs. Let the processing of the PDF finish before adding your question. 🙏🏾")
155
-
156
- st.write(css, unsafe_allow_html=True)
157
- #df2=upload_xls()
158
-
159
-
160
- if "conversation" not in st.session_state:
161
- st.session_state.conversation = None
162
- if "chat_history" not in st.session_state:
163
- st.session_state.chat_history = None
164
-
165
 
166
  st.header("Chat Bot PDFs :books:")
167
  user_question = st.text_input("Ask a question about your documents:")
168
- #user_question2=sentences_f(sentence=user_question1,df2=df2)
169
- #translator = GoogleTranslator(source='persian', target='english')
170
- #user_question = translator.translate(user_question2)
 
 
 
 
 
171
  if st.button("Answer"):
172
  with st.spinner("Answering"):
173
- handle_userinput(user_question)
174
 
175
  if st.button("CLEAR"):
176
  with st.spinner("CLEARING"):
177
  st.cache_data.clear()
178
 
179
 
180
- with st.sidebar:
181
- st.subheader("Your documents")
182
- pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
183
-
184
- if st.button("Process"):
185
- with st.spinner("Processing"):
186
- # get pdf text
187
- raw_text = get_pdf_text(pdf_docs)
188
-
189
- # get the text chunks
190
- text_chunks = get_text_chunks(raw_text)
191
-
192
- # create vector store
193
- vectorstore = get_vectorstore(text_chunks)
194
-
195
- # create conversation chain
196
- st.session_state.conversation = get_conversation_chain(vectorstore)
197
-
198
- #compelete build model
199
- st.write("compelete build model")
200
 
201
 
202
  if __name__ == "__main__":
 
1
+ from langchain.document_loaders import PyPDFDirectoryLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
3
  from langchain.vectorstores import FAISS
4
+ from langchain.llms import openai
 
5
  from langchain.chains import ConversationalRetrievalChain
 
 
 
 
 
 
6
  from langchain.chat_models import ChatOpenAI
7
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
 
 
 
 
 
 
 
8
  ###########################################################################################
9
 
10
def get_pdf_load(data_dir="./data"):
    """Load the PDF files found in a directory.

    Args:
        data_dir: Directory handed to ``PyPDFDirectoryLoader``. Defaults to
            "./data", the location that was previously hard-coded, so
            existing zero-argument calls behave as before.

    Returns:
        Whatever ``PyPDFDirectoryLoader.load()`` returns for that directory.
    """
    loader = PyPDFDirectoryLoader(data_dir)
    return loader.load()
 
 
 
 
 
 
 
 
14
  ########################################################################################
15
def get_text_split(document):
    """Split the loaded documents into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size
    of 1000 and an overlap of 100, and returns the result of
    ``split_documents`` on ``document``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_documents(document)
19
+ #########################################################################################
20
def get_vectorstore(texts):
    """Embed the chunks, index them with FAISS, save the index and return it.

    The embeddings come from ``HuggingFaceBgeEmbeddings`` running on the CPU;
    the resulting index is written to a fixed local path before being
    handed back to the caller.
    """
    index_dir = 'vectore_Imstudio/faiss'
    embedder = HuggingFaceBgeEmbeddings(
        model_name='Avditvs/multilingual-e5-small-distill-base-0.1',
        model_kwargs={'device': 'cpu'},
    )
    store = FAISS.from_documents(texts, embedder)
    store.save_local(index_dir)
    return store
28
+ ############################################################################################
29
def get_chain(
    db,
    base_url="https://bd4c-85-9-86-142.ngrok-free.app/v1",
    model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
    api_key="lm-studio",
    temperature=0.1,
    k=2,
):
    """Build a conversational retrieval chain on top of a vector store.

    Args:
        db: Vector store exposing ``as_retriever`` (the FAISS index built by
            ``get_vectorstore``).
        base_url: Base URL passed to ``ChatOpenAI``. The default is the
            tunnel address that used to be hard-coded; pass another URL when
            the server moves instead of editing this function.
        model: Model identifier passed to ``ChatOpenAI``.
        api_key: API key passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        k: Value passed to the retriever as ``search_kwargs={"k": k}``.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm`` with
        ``return_source_documents=True``. No memory object is attached, so
        the caller supplies ``chat_history`` with each question.
    """
    llm = ChatOpenAI(
        base_url=base_url,
        api_key=api_key,
        temperature=temperature,
        model=model,
    )
    retriever = db.as_retriever(search_kwargs={"k": k})
    return ConversationalRetrievalChain.from_llm(
        llm, retriever, return_source_documents=True
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  ####################################################################################################################
36
  def main():
37
  st.set_page_config(
38
  page_title="Chat Bot PDFs",
39
  page_icon=":books:",
40
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  st.header("Chat Bot PDFs :books:")
43
  user_question = st.text_input("Ask a question about your documents:")
44
+ if st.button("Build Model"):
45
+ with st.spinner("Waiting"):
46
+ document=get_pdf_load()
47
+ texts=et_text_split(document)
48
+ db=get_vectorstore(texts)
49
+ qa_chain=get_chain(db)
50
+ st.write("compelete build model")
51
+
52
  if st.button("Answer"):
53
  with st.spinner("Answering"):
54
+
55
 
56
  if st.button("CLEAR"):
57
  with st.spinner("CLEARING"):
58
  st.cache_data.clear()
59
 
60
 
61
+ #with st.sidebar:
62
+ #if st.button("Process build model"):
63
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
 
66
  if __name__ == "__main__":