"""PDF Summariser core.

Bootstraps runtime dependencies (Hugging Face Spaces style: provisioning
happens at import time), extracts text from regular or scanned PDFs, builds a
llama-index vector index for Q&A, and summarises the extracted text with a
LangChain summarize chain.
"""

import subprocess
import sys
import os

# --- runtime dependency bootstrap ------------------------------------------
# Use the current interpreter via `sys.executable -m pip` so packages are
# installed into the environment that is actually running this script
# (a bare "pip"/"python" may resolve to a different interpreter).
subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
subprocess.run([sys.executable, "-m", "pip", "install", "llama-index"])
subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2"])
# poppler-utils is required by pdf2image; tesseract-ocr by pytesseract.
os.system('apt-get install -y poppler-utils')
subprocess.run(["apt-get", "install", "-y", "tesseract-ocr"])
subprocess.run([sys.executable, "-m", "pip", "install", "pytesseract"])
subprocess.run([sys.executable, "-m", "pip", "install", "pdf2image"])

home_path = "/home/user/app/"        # staging area for uploaded PDFs
folder_path = "/home/user/app/doc/"  # extracted text is written and indexed here

# SECURITY FIX: the original file hard-coded live OpenAI API keys in source.
# Never commit secrets. The key now comes from the environment or from the
# UI textbox (see on_token_change); the placeholder merely lets the module
# import before a real key is supplied.
os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder-set-real-key-in-ui")

# llama-index
from llama_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
from llama_index.readers.file.docs_parser import PDFParser
from llama_index.readers.schema.base import Document
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
# for scanned (image-only) PDFs
import pdf2image
import pytesseract
from pytesseract import Output

llm = OpenAI(temperature=0)  # temperature=0: deterministic summaries
text_splitter = CharacterTextSplitter()

"""# Output ChatBox"""

import gradio as gr
from PyPDF2 import PdfReader
# NOTE: this langchain Document deliberately shadows the llama_index Document
# imported above — the summarize chain below expects langchain Documents.
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain


def _ensure_doc_folder():
    """Create folder_path if it does not exist yet (idempotent)."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder {folder_path} created.")
    else:
        print(f"Folder {folder_path} already exists.")


def _index_and_summarise(text, out_name, index_name, chainType):
    """Persist *text*, build and save a vector index for Q&A, and summarise.

    Parameters:
        text: extracted document text.
        out_name: filename (inside folder_path) to write the text to.
        index_name: path for the persisted GPTSimpleVectorIndex JSON.
        chainType: LangChain summarize chain type ("map_reduce"/"stuff"/"refine").

    Returns the summary string produced by the summarize chain.
    """
    print('Save to ' + out_name)
    _ensure_doc_folder()
    out_path = folder_path + out_name
    with open(out_path, 'w') as f:
        f.write(text)
    with open(out_path) as f:
        docRead = f.read()

    # Index everything under folder_path for later Q&A.
    # NOTE(review): if both output1.txt and output2.txt exist, the index
    # covers both documents — confirm this cross-contamination is intended.
    documents = SimpleDirectoryReader(folder_path).load_data()
    index = GPTSimpleVectorIndex.from_documents(documents)
    index.save_to_disk(index_name)

    print('chunking ...')
    texts = text_splitter.split_text(docRead)
    docs = [Document(page_content=t) for t in texts]

    print('Summarising ...')
    chain = load_summarize_chain(llm, chain_type=chainType)
    return chain.run(docs)


def extractScannedPDF(filePath, chainType):
    """OCR a scanned PDF (first pages only) and return its summary."""
    images = pdf2image.convert_from_path(filePath)
    counter = 0
    text = ""
    print('OCR Scanned PDF')
    for pil_im in images:
        print('Page ' + str(counter))
        counter += 1
        if counter >= 3:
            # OCR is slow: deliberately cap extraction at the first two pages
            # (preserves the original behaviour).
            break
        text += "\nPage " + str(counter) + "\n"
        ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
        text += " ".join(ocr_dict['text']) + "\n"
    return _index_and_summarise(text, 'output2.txt', 'index2.json', chainType)


def extractPDF(filePath, chainType):
    """Extract embedded text from a (non-scanned) PDF and return its summary."""
    reader = PdfReader(filePath)
    text = ""
    counter = 0
    print('Processing Text ... ')
    for page in reader.pages:
        counter += 1
        text += "\nPage " + str(counter) + "\n"
        text += page.extract_text() + "\n"
    print('Total No. of pages = ', counter)
    return _index_and_summarise(text, 'output1.txt', 'index1.json', chainType)


def qa1(query, rmode):
    """Answer *query* against the text-PDF index (index1.json)."""
    index = GPTSimpleVectorIndex.load_from_disk('index1.json')
    response = index.query(query, response_mode=rmode)
    return response.response


def qa2(query, rmode):
    """Answer *query* against the scanned-PDF index (index2.json)."""
    index = GPTSimpleVectorIndex.load_from_disk('index2.json')
    response = index.query(query, response_mode=rmode)
    return response.response


def on_token_change(user_token):
    """Adopt a user-supplied OpenAI API key from the UI textbox.

    FIX: the original required exactly len == 51 (the historical key length);
    newer OpenAI keys are longer. Accept any plausibly complete key instead
    (the length floor still filters out partial keystrokes from the change
    event firing while the user types).
    """
    if user_token and len(user_token) >= 40:
        os.environ["OPENAI_API_KEY"] = user_token


def _stage_upload(files, new_name):
    """Move a Gradio temp upload to a stable path and return that path.

    FIX: the original concatenated 't1' + 'pdf' (no dot), producing files
    named "t1pdf"/"t2pdf" with no extension.
    """
    new_path = os.path.join(home_path, new_name + '.pdf')
    print(files.name)
    os.rename(files.name, new_path)
    return new_path


def pdfv1(files, chainType):
    """Gradio handler: summarise an uploaded text PDF."""
    return extractPDF(_stage_upload(files, 't1'), chainType)


def pdfv2(files, chainType):
    """Gradio handler: summarise an uploaded scanned PDF.

    FIX: the original renamed the upload to its new path but then passed the
    OLD path (files.name) to extractScannedPDF — that path no longer exists
    after os.rename, so processing could only fail.
    """
    return extractScannedPDF(_stage_upload(files, 't2'), chainType)


def pdfv3(in1, in2):
    """Placeholder handler for the reserved (commented-out) tab."""
    return 'ok!!'
# ---------------------------------------------------------------------------
# Gradio UI: an API-key field, then two tabs (regular vs scanned PDFs). Each
# tab offers upload + summarise controls and an index-backed Q&A section.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # PDF Summariser (powered by OPENAI and LangChain)
    """)

    # The key typed here is pushed into the environment by on_token_change.
    user_token = gr.Textbox(
        show_label=True,
        placeholder=f"OpenAI API-key...",
        type="password",
        label="API-Key (Copy and Paste Here)",
    )
    user_token.change(on_token_change, inputs=[user_token], outputs=[])

    # --- tab 1: PDFs with an embedded text layer ---------------------------
    with gr.Tab("Summarise PDF"):
        inp1 = gr.File(label="Input PDF")
        chainType1 = gr.Radio(
            ["map_reduce", "stuff", "refine"],
            label="Chain_Type",
            value="map_reduce",
        )
        doSum1 = gr.Button("Summarise")
        out1 = gr.Textbox(label="Summary")
        # Summarisation runs both on upload and on explicit button click.
        inp1.change(pdfv1, inputs=[inp1, chainType1], outputs=[out1])
        doSum1.click(pdfv1, inputs=[inp1, chainType1], outputs=[out1])

        gr.Markdown("""# Q&A""")
        question1 = gr.Textbox(
            label="Question related to the pdf",
            placeholder="Question...",
        )
        gr.Examples(
            examples=[
                "what is the main idea of this journal?",
                "when did this paper publish?",
            ],
            inputs=question1,
        )
        radio1 = gr.Radio(
            ["default", "compact", "tree_summarize"],
            label="response_mode",
            value="default",
        )
        b1 = gr.Button("Query")
        answer1 = gr.Textbox(label="Answer")
        b1.click(qa1, inputs=[question1, radio1], outputs=answer1)

    # --- tab 2: scanned (image-only) PDFs, OCR pipeline ---------------------
    with gr.Tab("Summarise Scanned PDF"):
        inp2 = gr.File(label="Input PDF")
        chainType2 = gr.Radio(
            ["map_reduce", "stuff", "refine"],
            label="Chain_Type",
            value="map_reduce",
        )
        doSum2 = gr.Button("Summarise")
        out2 = gr.Textbox(label="Summary")
        inp2.change(pdfv2, inputs=[inp2, chainType2], outputs=[out2])
        doSum2.click(pdfv2, inputs=[inp2, chainType2], outputs=[out2])

        gr.Markdown("""# Q&A""")
        question2 = gr.Textbox(label="Question related to the pdf")
        gr.Examples(
            examples=[
                "what is the main idea of this journal?",
                "when did this paper publish?",
            ],
            inputs=question2,
        )
        radio2 = gr.Radio(
            ["default", "compact", "tree_summarize"],
            label="response_mode",
            value="default",
        )
        b2 = gr.Button("Query")
        answer2 = gr.Textbox(label="Answer")
        b2.click(qa2, inputs=[question2, radio2], outputs=answer2)

    # A third "Reserved" tab existed in commented-out form in the original and
    # remains disabled here.

if __name__ == "__main__":
    demo.launch(debug=True)