Create version/semapdf1.0.py
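This commit checks in the first versioned copy of the PesaDoc question-answering app: a Streamlit chat interface that extracts text from an uploaded PDF with PyPDF2, chunks it with CharacterTextSplitter, embeds the chunks into a FAISS index via OpenAIEmbeddings, and answers questions with a LangChain "stuff" QA chain, translating questions and answers between the user's language and English through the external Sema Translator endpoints.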
version/semapdf1.0.py  +105 -0
ADDED
@@ -0,0 +1,105 @@
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
from langchain.callbacks import get_openai_callback
from PyPDF2 import PdfReader
import json
import openai
import streamlit as st
import os
import requests


# Page configuration
st.set_page_config(page_title="PesaQ", page_icon="💸", layout="wide")

# Set the OpenAI API key from Streamlit secrets
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]


# Sema Translator
def translate(userinput, target_lang, source_lang=None):
    """Translate userinput to target_lang, detecting the source language if none is given."""
    if source_lang:
        url = "https://5d5c-44-208-85-154.ngrok-free.app/translate_enter/"
        data = {
            "userinput": userinput,
            "source_lang": source_lang,
            "target_lang": target_lang,
        }
        response = requests.post(url, json=data)
        result = response.json()
        translation = result['translated_text']
        return source_lang, translation
    else:
        url = "https://5d5c-44-208-85-154.ngrok-free.app/translate_detect/"
        data = {
            "userinput": userinput,
            "target_lang": target_lang,
        }
        response = requests.post(url, json=data)
        result = response.json()
        source_lang = result['source_language']
        translation = result['translated_text']
        return source_lang, translation


def main():
    st.title("📚 PesaDoc")
    # Upload file
    pdf = st.file_uploader("Upload a financial document and ask questions to get insights", type="pdf")

    # Extract the text
    if pdf is not None:
        reader = PdfReader(pdf)
        pdf_text = ''
        for page in reader.pages:
            text = page.extract_text()
            if text:
                pdf_text += text

        # Define the text splitter
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,      # 1,000 characters per chunk
            chunk_overlap=200,
            length_function=len,
        )
        # Apply the splitting
        text_chunks = text_splitter.split_text(pdf_text)

        # Use embeddings from OpenAI
        embeddings = OpenAIEmbeddings()
        # Index the chunks in a FAISS vector store
        pdf_embeddings = FAISS.from_texts(text_chunks, embeddings)
        chain = load_qa_chain(OpenAI(), chain_type="stuff")

        # Chat history lives in the Streamlit session
        if "messages" not in st.session_state:
            st.session_state.messages = []

        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        # Show user input
        if user_question := st.chat_input("Ask your document anything ...?"):
            with st.chat_message("user"):
                st.markdown(user_question)
            # Detect the user's language and translate the question to English
            user_langd, Queryd = translate(user_question, 'eng_Latn')
            st.session_state.messages.append({"role": "user", "content": user_question})
            docs = pdf_embeddings.similarity_search(Queryd)
            response = chain.run(input_documents=docs, question=Queryd)
            # Translate the answer back into the user's language
            output = translate(response, user_langd, 'eng_Latn')[1]
            with st.chat_message("assistant"):
                st.markdown(output)
            st.session_state.messages.append({"role": "assistant", "content": output})


if __name__ == '__main__':
    main()
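Running the app needs an OpenAI key exposed through Streamlit's secrets mechanism. A minimal local setup sketch, assuming the default secrets location (the file itself is not part of this commit):

# .streamlit/secrets.toml
OPENAI_API_KEY = "sk-..."

The app is then started with:

streamlit run version/semapdf1.0.py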
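The imports rely on the older LangChain module layout (langchain.llms, langchain.vectorstores, langchain.embeddings.openai) together with the pre-1.0 openai SDK, PyPDF2, FAISS, Streamlit, and requests. A sketch of a matching dependency list, assumed rather than taken from the repository:

# requirements.txt (assumed dependency set, not part of this commit)
langchain    # an older 0.0.x release keeps the import paths used above
openai       # pre-1.0 SDK expected by the legacy LangChain wrappers
faiss-cpu    # backs FAISS.from_texts
tiktoken     # commonly needed by OpenAIEmbeddings for token counting
PyPDF2
streamlit
requests

Note also that the Sema Translator calls target an ngrok-free.app tunnel, which is ephemeral; translate() assumes the endpoints return JSON containing 'source_language' and 'translated_text' keys.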