"""Gradio app that summarizes an uploaded PDF or text file with a hosted LLM."""

import warnings

# Installed before the remaining imports, presumably so that FutureWarnings
# raised while those packages are being imported are silenced as well.
warnings.simplefilter(action='ignore', category=FutureWarning)

import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.documents import Document
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Greedy decoding (do_sample=False) on the hosted Mistral instruct endpoint.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=1025,
    do_sample=False,
)
llm_engine_hf = ChatHuggingFace(llm=llm)

# Prompt sent to the model. Both occurrences of the word count use single
# braces: PromptTemplate treats doubled braces as an escape, which would put
# the literal text "{N_WORDS}" into the prompt instead of the number.
# NOTE(review): the prompt carries literal [INST] tags and is sent through
# ChatHuggingFace, which may apply the model's chat template as well --
# confirm the tags are not duplicated.
SUMMARY_TEMPLATE = '''
[INST]
Your task is to summarize a long text into a concise summary of a specific number of words.
The summary you generate must be EXACTLY {N_WORDS} words long.

Before writing your final summary, first break down the key points of the text.
Identify the most important information that should be included in a summary of the specified length.

Then, write a summary that captures the core ideas and key details of the text. Start with an introductory sentence and then concisely summarize the main points in a logical order. Make sure to stay within the {N_WORDS} word limit.

Here is the long text to summarize:

Text:
{TEXT}
[/INST]
'''

SUMMARY_PROMPT = PromptTemplate(
    template=SUMMARY_TEMPLATE,
    input_variables=['TEXT', "N_WORDS"],
)


def read_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Args:
        file_path: Path (or file object) accepted by ``PyPDF2.PdfReader``.

    Returns:
        The extracted page texts joined in page order, with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # ``or ""`` keeps the join safe if a page yields no text at all.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def _read_document(file_path):
    """Load the text of *file_path*: PDFs via ``read_pdf``, anything else as UTF-8."""
    # Case-insensitive so that e.g. "REPORT.PDF" is not opened as plain text.
    if str(file_path).lower().endswith('.pdf'):
        return read_pdf(file_path)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError as err:
        raise gr.Error(
            "The uploaded file is neither a PDF nor a UTF-8 encoded text file."
        ) from err


def summarize(file, n_words):
    """Summarize the uploaded file in roughly *n_words* words.

    Args:
        file: The value delivered by the ``gr.File`` input. Either an object
            exposing the on-disk path as ``.name`` or a plain path string is
            accepted; ``None`` means nothing was uploaded.
        n_words: Target length of the summary, inserted into the prompt.

    Returns:
        The ``content`` of the chat model's response.

    Raises:
        gr.Error: If no file was uploaded, the file cannot be decoded, or no
            text could be extracted from it.
    """
    if file is None:
        raise gr.Error("Please upload a file before requesting a summary.")

    file_path = getattr(file, "name", file)
    text = _read_document(file_path)
    if not text.strip():
        raise gr.Error("No text could be extracted from the uploaded file.")

    # NOTE(review): the whole document goes into a single prompt, so very long
    # inputs may exceed the model's context window -- confirm expected sizes.
    formatted_prompt = SUMMARY_PROMPT.format(TEXT=text, N_WORDS=n_words)
    output_summary = llm_engine_hf.invoke(formatted_prompt)
    return output_summary.content


def download_summary(output_text):
    """Write *output_text* to ``summary.txt`` and return its path.

    Args:
        output_text: Summary text to save.

    Returns:
        ``Path('summary.txt')`` after writing, or ``None`` when *output_text*
        is empty or ``None`` (no file is written in that case).
    """
    if not output_text:
        return None

    file_path = Path('summary.txt')
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(output_text)
    return file_path


def create_download_file(summary_text):
    """Save *summary_text* via ``download_summary`` and return the path as a string.

    Returns:
        The saved file's path as ``str``, or ``None`` if nothing was written.
    """
    file_path = download_summary(summary_text)
    return str(file_path) if file_path else None


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            n_words = gr.Slider(minimum=50, maximum=500, step=50, label="Number of words (approximately)")
            file = gr.File(label="Submit a file")

        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file, n_words], outputs=output_text)

    # The download button saves whatever is currently in the summary textbox
    # and exposes the resulting file through a fresh File component.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File()
    )

# Run the Gradio app
demo.launch(share=True)