Spaces:
Sleeping
Sleeping
import warnings | |
warnings.simplefilter(action='ignore', category=FutureWarning) | |
import PyPDF2 | |
import gradio as gr | |
from langchain.prompts import PromptTemplate | |
from langchain.chains.summarize import load_summarize_chain | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import DirectoryLoader | |
from langchain_core.documents import Document | |
from pathlib import Path | |
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint | |
# Settings for the hosted Mistral-7B-Instruct text-generation endpoint.
# Kept in one mapping so the generation parameters can be read at a glance.
_ENDPOINT_KWARGS = {
    "repo_id": "mistralai/Mistral-7B-Instruct-v0.3",
    "task": "text-generation",
    "max_new_tokens": 1025,
    "do_sample": False,
}

# Raw endpoint client, wrapped in the chat-model interface that
# ``summarize`` invokes.
llm = HuggingFaceEndpoint(**_ENDPOINT_KWARGS)
llm_engine_hf = ChatHuggingFace(llm=llm)
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location of the PDF, passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # ``or ""`` keeps the join safe if a page yields None instead of a string
    # (NOTE(review): reported for image-only pages on some PyPDF2 versions --
    # confirm against the pinned version).
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Prompt sent to the model. Both placeholders use single braces so that
# ``PromptTemplate`` substitutes them; doubled braces would be emitted as the
# literal text "{N_WORDS}".
_SUMMARY_TEMPLATE = ''' [INST]
Your task is to summarize a long text into a concise summary of a specific number of words.
The summary you generate must be EXACTLY {N_WORDS} words long.
Before writing your final summary, first break down the key points of the text in a <scratchpad>. Identify the most important information that should be included in a summary of the specified length.
Then, write a summary that captures the core ideas and key details of the text. Start with an introductory sentence and then concisely summarize the main points in a logical order. Make sure to stay within the {N_WORDS} word limit.
Here is the long text to summarize:
Text:
{TEXT}
[/INST]
'''


def summarize(file, n_words):
    """Summarize an uploaded document in roughly ``n_words`` words.

    PDF files (matched on a case-insensitive ``.pdf`` suffix) are read with
    ``read_pdf``; anything else is read as UTF-8 text. The entire document is
    placed into a single prompt -- no chunking is performed.
    NOTE(review): very long documents may exceed the model's context window.

    Args:
        file: The value Gradio passes for the ``gr.File`` input: either an
            object exposing the path as ``.name`` or a plain path string.
            ``None`` when nothing was uploaded.
        n_words: Target summary length in words, inserted into the prompt.

    Returns:
        The ``content`` of the chat model's reply.

    Raises:
        gr.Error: If no file was uploaded or no text could be read from it.
    """
    if file is None:
        raise gr.Error("Please upload a file before requesting a summary.")

    # Accept both a wrapper with ``.name`` and a bare path string.
    # NOTE(review): which one arrives depends on the Gradio version/config.
    file_path = getattr(file, "name", file)

    # Compare the suffix case-insensitively so "REPORT.PDF" is not opened as
    # UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    if not text.strip():
        raise gr.Error("No readable text could be extracted from the uploaded file.")

    prompt = PromptTemplate(
        template=_SUMMARY_TEMPLATE,
        input_variables=['TEXT', "N_WORDS"],
    )
    formatted_prompt = prompt.format(TEXT=text, N_WORDS=n_words)
    output_summary = llm_engine_hf.invoke(formatted_prompt)
    return output_summary.content
def download_summary(output_text):
    """Write the summary to ``summary.txt`` and return the file's path.

    Args:
        output_text: Summary text to save.

    Returns:
        ``Path('summary.txt')`` after the text has been written as UTF-8,
        or ``None`` when ``output_text`` is empty or otherwise falsy (no file
        is written in that case).
    """
    # Guard clause: nothing to save.
    if not output_text:
        return None

    target = Path('summary.txt')
    with target.open('w', encoding='utf-8') as handle:
        handle.write(output_text)
    return target
def create_download_file(summary_text):
    """Save the summary via ``download_summary`` and return its path as a string.

    Args:
        summary_text: Summary text taken from the output textbox.

    Returns:
        The saved file's path as ``str``, or ``None`` when
        ``download_summary`` returned nothing.
    """
    saved_path = download_summary(summary_text)
    if not saved_path:
        return None
    return str(saved_path)
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        # Input column: target length and the document to summarize.
        with gr.Column():
            n_words = gr.Slider(minimum=50, maximum=500, step=50, label="Number of words (approximately)")
            file = gr.File(label="Submit a file")
        # Output column: the generated summary.
        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file, n_words], outputs=output_text)

    # The download handler receives the textbox's current text through
    # ``inputs`` and returns a file path that fills the File component.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File(),
    )

# Run the Gradio app
demo.launch(share=True)