"""PDF Summariser core.

Bootstraps runtime dependencies (Hugging Face Spaces style: provisioning
happens at import time), extracts text from regular or scanned PDFs, builds a
llama-index vector index for Q&A, and summarises the extracted text with a
LangChain summarize chain.
"""

import subprocess
import sys
import os

# --- runtime dependency bootstrap ------------------------------------------
# Use the current interpreter via `sys.executable -m pip` so packages are
# installed into the environment that is actually running this script
# (a bare "pip"/"python" may resolve to a different interpreter).
subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
subprocess.run([sys.executable, "-m", "pip", "install", "llama-index"])
subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2"])
# poppler-utils is required by pdf2image; tesseract-ocr by pytesseract.
os.system('apt-get install -y poppler-utils')
subprocess.run(["apt-get", "install", "-y", "tesseract-ocr"])
subprocess.run([sys.executable, "-m", "pip", "install", "pytesseract"])
subprocess.run([sys.executable, "-m", "pip", "install", "pdf2image"])

home_path = "/home/user/app/"        # staging area for uploaded PDFs
folder_path = "/home/user/app/doc/"  # extracted text is written and indexed here

# SECURITY FIX: the original file hard-coded live OpenAI API keys in source.
# Never commit secrets. The key now comes from the environment or from the
# UI textbox (see on_token_change); the placeholder merely lets the module
# import before a real key is supplied.
os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder-set-real-key-in-ui")

# llama-index
from llama_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
from llama_index.readers.file.docs_parser import PDFParser
from llama_index.readers.schema.base import Document
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
# for scanned (image-only) PDFs
import pdf2image
import pytesseract
from pytesseract import Output

llm = OpenAI(temperature=0)  # temperature=0: deterministic summaries
text_splitter = CharacterTextSplitter()

"""# Output ChatBox"""

import gradio as gr
from PyPDF2 import PdfReader
# NOTE: this langchain Document deliberately shadows the llama_index Document
# imported above — the summarize chain below expects langchain Documents.
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain


def _ensure_doc_folder():
    """Create folder_path if it does not exist yet (idempotent)."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder {folder_path} created.")
    else:
        print(f"Folder {folder_path} already exists.")


def _index_and_summarise(text, out_name, index_name, chainType):
    """Persist *text*, build and save a vector index for Q&A, and summarise.

    Parameters:
        text: extracted document text.
        out_name: filename (inside folder_path) to write the text to.
        index_name: path for the persisted GPTSimpleVectorIndex JSON.
        chainType: LangChain summarize chain type ("map_reduce"/"stuff"/"refine").

    Returns the summary string produced by the summarize chain.
    """
    print('Save to ' + out_name)
    _ensure_doc_folder()
    out_path = folder_path + out_name
    with open(out_path, 'w') as f:
        f.write(text)
    with open(out_path) as f:
        docRead = f.read()

    # Index everything under folder_path for later Q&A.
    # NOTE(review): if both output1.txt and output2.txt exist, the index
    # covers both documents — confirm this cross-contamination is intended.
    documents = SimpleDirectoryReader(folder_path).load_data()
    index = GPTSimpleVectorIndex.from_documents(documents)
    index.save_to_disk(index_name)

    print('chunking ...')
    texts = text_splitter.split_text(docRead)
    docs = [Document(page_content=t) for t in texts]

    print('Summarising ...')
    chain = load_summarize_chain(llm, chain_type=chainType)
    return chain.run(docs)


def extractScannedPDF(filePath, chainType):
    """OCR a scanned PDF (first pages only) and return its summary."""
    images = pdf2image.convert_from_path(filePath)
    counter = 0
    text = ""
    print('OCR Scanned PDF')
    for pil_im in images:
        print('Page ' + str(counter))
        counter += 1
        if counter >= 3:
            # OCR is slow: deliberately cap extraction at the first two pages
            # (preserves the original behaviour).
            break
        text += "\nPage " + str(counter) + "\n"
        ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
        text += " ".join(ocr_dict['text']) + "\n"
    return _index_and_summarise(text, 'output2.txt', 'index2.json', chainType)


def extractPDF(filePath, chainType):
    """Extract embedded text from a (non-scanned) PDF and return its summary."""
    reader = PdfReader(filePath)
    text = ""
    counter = 0
    print('Processing Text ... ')
    for page in reader.pages:
        counter += 1
        text += "\nPage " + str(counter) + "\n"
        text += page.extract_text() + "\n"
    print('Total No. of pages = ', counter)
    return _index_and_summarise(text, 'output1.txt', 'index1.json', chainType)


def qa1(query, rmode):
    """Answer *query* against the text-PDF index (index1.json)."""
    index = GPTSimpleVectorIndex.load_from_disk('index1.json')
    response = index.query(query, response_mode=rmode)
    return response.response


def qa2(query, rmode):
    """Answer *query* against the scanned-PDF index (index2.json)."""
    index = GPTSimpleVectorIndex.load_from_disk('index2.json')
    response = index.query(query, response_mode=rmode)
    return response.response


def on_token_change(user_token):
    """Adopt a user-supplied OpenAI API key from the UI textbox.

    FIX: the original required exactly len == 51 (the historical key length);
    newer OpenAI keys are longer. Accept any plausibly complete key instead
    (the length floor still filters out partial keystrokes from the change
    event firing while the user types).
    """
    if user_token and len(user_token) >= 40:
        os.environ["OPENAI_API_KEY"] = user_token


def _stage_upload(files, new_name):
    """Move a Gradio temp upload to a stable path and return that path.

    FIX: the original concatenated 't1' + 'pdf' (no dot), producing files
    named "t1pdf"/"t2pdf" with no extension.
    """
    new_path = os.path.join(home_path, new_name + '.pdf')
    print(files.name)
    os.rename(files.name, new_path)
    return new_path


def pdfv1(files, chainType):
    """Gradio handler: summarise an uploaded text PDF."""
    return extractPDF(_stage_upload(files, 't1'), chainType)


def pdfv2(files, chainType):
    """Gradio handler: summarise an uploaded scanned PDF.

    FIX: the original renamed the upload to its new path but then passed the
    OLD path (files.name) to extractScannedPDF — that path no longer exists
    after os.rename, so processing could only fail.
    """
    return extractScannedPDF(_stage_upload(files, 't2'), chainType)


def pdfv3(in1, in2):
    """Placeholder handler for the reserved (commented-out) tab."""
    return 'ok!!'
# ---------------------------------------------------------------------------
# Gradio UI: an API-key field, then two tabs (regular vs scanned PDFs). Each
# tab offers upload + summarise controls and an index-backed Q&A section.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # PDF Summariser (powered by OPENAI and LangChain)
    """)

    # The key typed here is pushed into the environment by on_token_change.
    user_token = gr.Textbox(
        show_label=True,
        placeholder=f"OpenAI API-key...",
        type="password",
        label="API-Key (Copy and Paste Here)",
    )
    user_token.change(on_token_change, inputs=[user_token], outputs=[])

    # --- tab 1: PDFs with an embedded text layer ---------------------------
    with gr.Tab("Summarise PDF"):
        inp1 = gr.File(label="Input PDF")
        chainType1 = gr.Radio(
            ["map_reduce", "stuff", "refine"],
            label="Chain_Type",
            value="map_reduce",
        )
        doSum1 = gr.Button("Summarise")
        out1 = gr.Textbox(label="Summary")
        # Summarisation runs both on upload and on explicit button click.
        inp1.change(pdfv1, inputs=[inp1, chainType1], outputs=[out1])
        doSum1.click(pdfv1, inputs=[inp1, chainType1], outputs=[out1])

        gr.Markdown("""# Q&A""")
        question1 = gr.Textbox(
            label="Question related to the pdf",
            placeholder="Question...",
        )
        gr.Examples(
            examples=[
                "what is the main idea of this journal?",
                "when did this paper publish?",
            ],
            inputs=question1,
        )
        radio1 = gr.Radio(
            ["default", "compact", "tree_summarize"],
            label="response_mode",
            value="default",
        )
        b1 = gr.Button("Query")
        answer1 = gr.Textbox(label="Answer")
        b1.click(qa1, inputs=[question1, radio1], outputs=answer1)

    # --- tab 2: scanned (image-only) PDFs, OCR pipeline ---------------------
    with gr.Tab("Summarise Scanned PDF"):
        inp2 = gr.File(label="Input PDF")
        chainType2 = gr.Radio(
            ["map_reduce", "stuff", "refine"],
            label="Chain_Type",
            value="map_reduce",
        )
        doSum2 = gr.Button("Summarise")
        out2 = gr.Textbox(label="Summary")
        inp2.change(pdfv2, inputs=[inp2, chainType2], outputs=[out2])
        doSum2.click(pdfv2, inputs=[inp2, chainType2], outputs=[out2])

        gr.Markdown("""# Q&A""")
        question2 = gr.Textbox(label="Question related to the pdf")
        gr.Examples(
            examples=[
                "what is the main idea of this journal?",
                "when did this paper publish?",
            ],
            inputs=question2,
        )
        radio2 = gr.Radio(
            ["default", "compact", "tree_summarize"],
            label="response_mode",
            value="default",
        )
        b2 = gr.Button("Query")
        answer2 = gr.Textbox(label="Answer")
        b2.click(qa2, inputs=[question2, radio2], outputs=answer2)

    # A third "Reserved" tab existed in commented-out form in the original and
    # remains disabled here.

if __name__ == "__main__":
    demo.launch(debug=True)