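# NOTE: the following lines appear to be file-viewer page residue captured
# along with the source (not part of the program); kept here as comments.
# Chan-Y's picture
# Update app.py
# 65033bf verified
# raw
# history blame
# 3.54 kB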
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import fitz
import gradio as gr
from langchain.prompts import PromptTemplate
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_core.output_parsers import JsonOutputParser
from langdetect import detect
# Maximum number of characters of a document handed to the model;
# summarize() truncates longer text to this length.
CONTEXT_WINDOW = 50_000

# Hosted text-generation endpoint that summarize() invokes to produce the
# summary.
# NOTE(review): temperature is set while do_sample=False — presumably the
# temperature has no effect when sampling is disabled; confirm which of the
# two settings is intended.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=4096,
    temperature=0.5,
    do_sample=False,
)

# Chat-style wrapper around the same endpoint.
# NOTE(review): not referenced anywhere else in the visible file.
llm_engine_hf = ChatHuggingFace(llm=llm)
def read_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages concatenated in page order, or an empty
        string when the file cannot be opened or read. Failures are printed
        rather than raised, so callers always receive a ``str``.
    """
    print("It is a PDF file")
    try:
        # The context manager closes the document even when extracting a
        # page fails, so the file handle is never leaked.
        with fitz.open(file_path) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:  # best-effort: report and fall back to no text
        print("Error reading file,", e)
        return ""
def read_txt(file_path):
    """Return the entire contents of a text file decoded as UTF-8.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    print("It is not a PDF file")
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()
def summarize(file, n_words=None):
    """Summarize an uploaded document as a bulleted list in its own language.

    Args:
        file: The uploaded file. Its ``name`` attribute is used as the path
            on disk; a plain path is accepted as well. ``None`` means that
            nothing was uploaded.
        n_words: Currently unused. It is optional so the function can be
            called with the file alone.

    Returns:
        The summary text produced by the model, or a short explanatory
        message when there is nothing to summarize.
    """
    if file is None:
        return "Please upload a file to summarize."

    # Read the content of the uploaded file
    file_path = str(getattr(file, "name", file))
    # Compare case-insensitively so that e.g. "REPORT.PDF" is not read as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        text = read_txt(file_path)

    # A failed or empty extraction leaves nothing to send to the model, and
    # language detection cannot work on empty input.
    if not text or not text.strip():
        return "No readable text was found in the uploaded file."

    print("Length of text is ", len(text))
    if len(text) > CONTEXT_WINDOW:
        print(f"Slicing the first {CONTEXT_WINDOW} characters")
        text = text[:CONTEXT_WINDOW]

    lang = detect(text)

    template_summarize = (
        "\n"
        "Please carefully read the following document:\n"
        "<document>\n"
        "{TEXT}\n"
        "</document>\n"
        "After reading through the document, pinpoint the key points and main ideas covered in the text.\n"
        "Organize these key points into a concise bulleted list that summarizes the essential information from the document.\n"
        "The summary should be in {LANG} language.\n"
    )
    prompt_summarize = PromptTemplate(
        template=template_summarize,
        input_variables=["TEXT", "LANG"]
    )

    # The text has already been cut to at most CONTEXT_WINDOW characters, so
    # it always fits in a single prompt and one model call is enough.
    formatted_prompt = prompt_summarize.format(TEXT=text, LANG=lang)
    return llm.invoke(formatted_prompt)
def download_summary(output_text):
    """Write the summary to ``summary.txt`` in the working directory.

    Args:
        output_text: Summary text to save.

    Returns:
        The ``Path`` of the written file, or ``None`` when ``output_text``
        is empty or missing (in which case nothing is written).
    """
    if not output_text:
        return None

    target = Path('summary.txt')
    with target.open('w', encoding='utf-8') as handle:
        handle.write(output_text)
    return target
def create_download_file(summary_text):
    """Save the summary via ``download_summary`` and return its path as text.

    Args:
        summary_text: Summary text to save.

    Returns:
        The saved file's path as a ``str``, or ``None`` when
        ``download_summary`` returned nothing.
    """
    saved_path = download_summary(summary_text)
    if not saved_path:
        return None
    return str(saved_path)
# Create the Gradio interface
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost — confirm the layout against the original file.
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        # Left column: the document to summarize.
        with gr.Column():
            file = gr.File(label="Submit a file")

        # Right column: the generated summary.
        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    # Only the file component is wired as an input here; summarize's n_words
    # parameter receives no value from the UI.
    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file], outputs=output_text)

    # NOTE(review): generate_file is not referenced anywhere in the visible
    # file, and it passes the Textbox component object (not its text) to
    # download_summary — likely leftover code; confirm before removing.
    def generate_file():
        summary_text = output_text
        file_path = download_summary(summary_text)
        return file_path

    # Saves the current summary text to a file and offers it through a new
    # File component created as the click output.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File()
    )

# Run the Gradio app
demo.launch(share=True)