Chan-Y's picture
Update app.py
8639815 verified
raw
history blame
3.7 kB
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.documents import Document
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
llm = HuggingFaceEndpoint(
repo_id="mistralai/Mistral-7B-Instruct-v0.3",
task="text-generation",
max_new_tokens=4096,
do_sample=False,
)
llm_engine_hf = ChatHuggingFace(llm=llm)
def read_pdf(file_path):
pdf_reader = PyPDF2.PdfReader(file_path)
text = ""
for page in range(len(pdf_reader.pages)):
text += pdf_reader.pages[page].extract_text()
return text
def summarize(file, n_words):
# Read the content of the uploaded file
file_path = file.name
if file_path.endswith('.pdf'):
file_content = read_pdf(file_path)
else:
with open(file_path, 'r', encoding='utf-8') as f:
file_content = f.read()
document = Document(file_content)
# Generate the summary
text = document.page_content
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)
chunks = text_splitter.create_documents([text])
n_words = n_words
template = ''' [INST]
Your task is to summarize a long text into a concise summary of a specific number of words.
The summary you generate must be EXACTLY {N_WORDS} words long.
Before writing your final summary, first break down the key points of the text in a <scratchpad>. Identify the most important information that should be included in a summary of the specified length.
Then, write a summary that captures the core ideas and key details of the text. Start with an introductory sentence and then concisely summarize the main points in a logical order. Make sure to stay within the {{N_WORDS}} word limit.
Here is the long text to summarize:
Text:
{TEXT}
[/INST]
'''
prompt = PromptTemplate(
template=template,
input_variables=['TEXT', "N_WORDS"]
)
formatted_prompt = prompt.format(TEXT=text, N_WORDS=n_words)
output_summary = llm_engine_hf.invoke(formatted_prompt)
return output_summary.content
def download_summary(output_text):
if output_text:
file_path = Path('summary.txt')
with open(file_path, 'w', encoding='utf-8') as f:
f.write(output_text)
return file_path
else:
return None
def create_download_file(summary_text):
file_path = download_summary(summary_text)
return str(file_path) if file_path else None
# Create the Gradio interface
with gr.Blocks() as demo:
gr.Markdown("## Document Summarizer")
with gr.Row():
with gr.Column():
n_words = gr.Slider(minimum=50, maximum=500, step=50, label="Number of words (approximately)")
file = gr.File(label="Submit a file")
with gr.Column():
output_text = gr.Textbox(label="Summary", lines=20)
submit_button = gr.Button("Summarize")
submit_button.click(summarize, inputs=[file, n_words], outputs=output_text)
def generate_file():
summary_text = output_text
file_path = download_summary(summary_text)
return file_path
download_button = gr.Button("Download Summary")
download_button.click(
fn=create_download_file,
inputs=[output_text],
outputs=gr.File()
)
# Run the Gradio app
demo.launch(share=True)