Spaces:
Sleeping
Sleeping
File size: 1,807 Bytes
fe5256f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# ChatPDF document ingestion page: indexes the PDFs under data/ into a Chroma vector store
import streamlit as st
import os
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sys,yaml,Utilities as ut
# Page-level settings (browser tab title and icon). Kept ahead of every other
# st.* call; Streamlit releases have required set_page_config to run first.
# NOTE(review): page_icon looks like a mis-decoded emoji -- confirm the intended value.
st.set_page_config(
    page_title="ChatPDF Ingestion",
    page_icon="π",
)
def load_pdf(data_dir="data/", chunk_size=700, chunk_overlap=70):
    """Index every PDF under *data_dir* into the persistent Chroma store.

    Settings are read from ``ut.get_tokens()``: ``"embedding_model"`` is the
    model name given to ``HuggingFaceEmbeddings`` and ``"chatPDF_chroma_db"``
    is the directory passed to Chroma as ``persist_directory``.

    Args:
        data_dir: Directory searched recursively (``**/*.pdf``) for PDFs.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the same splitter.

    Returns:
        The Chroma vector store built from the split documents.

    Raises:
        KeyError: If ``"embedding_model"`` or ``"chatPDF_chroma_db"`` is
            missing from the settings (assuming ``ut.get_tokens()`` returns
            a plain dict -- verify against Utilities).
    """
    settings = ut.get_tokens()
    embedding_model_id = settings["embedding_model"]
    chroma_db_path = settings["chatPDF_chroma_db"]

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_id)

    # Load every PDF below data_dir, one loader instance per file.
    loader = DirectoryLoader(
        data_dir,
        glob="**/*.pdf",
        show_progress=True,
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Split the documents into smaller, overlapping chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = text_splitter.split_documents(documents)

    # Embed the chunks and store them in the Chroma database at chroma_db_path.
    return Chroma.from_documents(
        texts,
        embeddings,
        persist_directory=chroma_db_path,
    )
st.title("PatentGuru - Document Ingestion ")

# Ingestion form: a single submit button that triggers indexing of the PDFs.
with st.form("chat_form"):
    submit_button = st.form_submit_button("Upload..")
    if submit_button:
        # Runs the whole load/split/embed pipeline before reporting success.
        load_pdf()
        st.write("Uploaded successfully")