# File size: 1,807 Bytes
# fe5256f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File Selection Drop Down
import streamlit as st
import os
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sys,yaml,Utilities as ut


# Browser-tab title and favicon for this page. The icon is the "chart
# increasing" emoji (U+1F4C8); it was previously stored as mis-decoded bytes.
st.set_page_config(page_title="ChatPDF Ingestion", page_icon="📈")

def load_pdf(data_dir="data/", *, chunk_size=700, chunk_overlap=70):
    """Embed every PDF found under *data_dir* into a persisted Chroma store.

    Settings come from ``ut.get_tokens()``: the ``"embedding_model"`` entry is
    used as the HuggingFace embedding model name, and the
    ``"chatPDF_chroma_db"`` entry is handed to Chroma as its persist directory.

    Args:
        data_dir: Directory searched recursively (``**/*.pdf``) for documents.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the same splitter.

    Returns:
        The ``Chroma`` vector store created from the split documents.

    Raises:
        KeyError: If either required settings entry is missing.
    """
    # Read both settings up front so a misconfigured environment fails before
    # the embedding model is instantiated.
    settings = ut.get_tokens()
    embedding_model_id = settings["embedding_model"]
    chroma_db_path = settings["chatPDF_chroma_db"]

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_id)

    loader = DirectoryLoader(
        data_dir,
        glob="**/*.pdf",
        show_progress=True,
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Split the loaded documents into smaller, overlapping chunks before
    # embedding them.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(documents)

    # Chroma stores the chunk embeddings under the configured directory.
    return Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=chroma_db_path,
    )

# Page heading for the ingestion screen.
st.title("PatentGuru  - Document Ingestion ")

# The form holds a single submit button; there are no input fields.
with st.form("chat_form"):
    upload_requested = st.form_submit_button("Upload..")

# Run the ingestion once the button has been pressed, then report completion.
if upload_requested:
    load_pdf()
    st.write("Uploaded successfully")