# pdfchat / version/semapdf1.0.py
# Author: kamau1
# Commit: "Create version/semapdf1.0.py" (2397ad2, verified)
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
from langchain.callbacks import get_openai_callback
from PyPDF2 import PdfReader
import json
import openai
import streamlit as st
import os
import requests
# Page configuration.
# The icon is written as an escape sequence (U+1F4B8, "money with wings") so
# the literal survives files/tools that mishandle non-ASCII source text; the
# previous literal was the mojibake form of this same emoji.
st.set_page_config(page_title="PesaQ", page_icon="\U0001F4B8", layout="wide")

# Set the OpenAI key from Streamlit secrets. OpenAIEmbeddings() and OpenAI()
# are constructed later without an explicit key, so they rely on this
# environment variable being present.
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
# Sema Translator
def translate(userinput, target_lang, source_lang=None, timeout=60):
    """Translate *userinput* into *target_lang* via the Sema translation service.

    Args:
        userinput: Text to translate.
        target_lang: Language code to translate into (callers in this file
            pass codes such as ``'eng_Latn'``).
        source_lang: Language code of *userinput*. When falsy, the
            ``translate_detect`` endpoint is used and the service reports the
            source language itself.
        timeout: Seconds to wait for the service before giving up, so an
            unreachable endpoint cannot hang the app indefinitely.

    Returns:
        A ``(source_language, translated_text)`` tuple. ``source_language`` is
        *source_lang* when it was supplied, otherwise the value returned by
        the service under ``'source_language'``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        KeyError: If the response JSON lacks the expected keys.
    """
    base_url = "https://5d5c-44-208-85-154.ngrok-free.app"

    if source_lang:
        endpoint = "translate_enter"
        payload = {
            "userinput": userinput,
            "source_lang": source_lang,
            "target_lang": target_lang,
        }
    else:
        endpoint = "translate_detect"
        payload = {
            "userinput": userinput,
            "target_lang": target_lang,
        }

    response = requests.post(f"{base_url}/{endpoint}/", json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of tripping over a missing JSON key.
    response.raise_for_status()
    result = response.json()

    detected_lang = source_lang if source_lang else result["source_language"]
    return detected_lang, result["translated_text"]
def main():
    """Run the PesaDoc page: upload a PDF, index it, and chat about its contents.

    Each question is translated to English with ``translate`` (source language
    auto-detected), answered by a "stuff" QA chain over the most similar PDF
    chunks, and the answer is translated back into the detected language.
    The chat history is kept in ``st.session_state.messages``.
    """
    # U+1F4DA ("books") written as an escape; the previous literal was the
    # mojibake form of this same emoji.
    st.title("\U0001F4DA PesaDoc")

    # upload file
    pdf = st.file_uploader(
        "Upload a financial Document and ask questions to get insights", type="pdf"
    )
    if pdf is None:
        return

    # Streamlit re-runs this function on every interaction, so cache the vector
    # index per uploaded document instead of re-embedding the whole PDF (and
    # paying for the embedding calls) on every chat message.
    # NOTE(review): assumes the uploaded-file object exposes ``name`` and
    # ``size`` - confirm against the Streamlit version in use.
    doc_key = (pdf.name, pdf.size)
    if st.session_state.get("pdf_index_key") != doc_key:
        st.session_state.pdf_index = _build_index(pdf)
        st.session_state.pdf_index_key = doc_key
    pdf_embeddings = st.session_state.pdf_index

    if pdf_embeddings is None:
        # Nothing to embed (e.g. a scanned, image-only PDF); building a FAISS
        # index from an empty chunk list would fail.
        st.warning("No extractable text was found in this PDF.")
        return

    chain = load_qa_chain(OpenAI(), chain_type="stuff")

    # Replay the conversation so far.
    if "messages" not in st.session_state:
        st.session_state.messages = []
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if user_question := st.chat_input("Ask your document anything ......?"):
        with st.chat_message("user"):
            st.markdown(user_question)
        # Record the question before any network call so it is kept in the
        # history even if translation or the LLM call fails.
        st.session_state.messages.append({"role": "user", "content": user_question})

        # Detect the user's language and get the English form of the question.
        user_lang, english_query = translate(user_question, 'eng_Latn')

        docs = pdf_embeddings.similarity_search(english_query)
        response = chain.run(input_documents=docs, question=english_query)

        # Translate the English answer back into the user's language.
        output = translate(response, user_lang, 'eng_Latn')[1]
        with st.chat_message("assistant"):
            st.markdown(output)
        st.session_state.messages.append({"role": "assistant", "content": output})


def _build_index(pdf):
    """Extract the text of *pdf*, chunk it, and embed the chunks into a FAISS index.

    Returns None when no text could be extracted from the document.
    """
    reader = PdfReader(pdf)
    extracted = (page.extract_text() for page in reader.pages)
    # Join pages with a newline so the last word of one page is not fused with
    # the first word of the next; "\n" is also the splitter's separator.
    pdf_text = "\n".join(text for text in extracted if text)

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,  # characters per chunk
        chunk_overlap=200,
        length_function=len,
    )
    text_chunks = text_splitter.split_text(pdf_text)
    if not text_chunks:
        return None

    # Embed the chunks with OpenAI embeddings and index them for similarity search.
    return FAISS.from_texts(text_chunks, OpenAIEmbeddings())
# Script entry point: launch the app only when this file is executed directly.
if __name__ == "__main__":
    main()