Spaces:

Chan-Y
/

Mistral-7B-Summarizer

Sleeping

App Files Community

Mistral-7B-Summarizer / app.py

Chan-Y

Update app.py

026783f verified 5 months ago

raw

history blame

3.7 kB

	import warnings
	warnings.simplefilter(action='ignore', category=FutureWarning)

	import PyPDF2
	import gradio as gr
	from langchain.prompts import PromptTemplate
	from langchain.chains.summarize import load_summarize_chain
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import DirectoryLoader
	from langchain_core.documents import Document
	from pathlib import Path
	from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

	# Hugging Face endpoint client for the Mistral-7B instruct model. Sampling is
	# switched off (do_sample=False) and each reply is capped at 1025 new tokens.
	llm = HuggingFaceEndpoint(
	    task="text-generation",
	    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
	    do_sample=False,
	    max_new_tokens=1025,
	)

	# Chat-model wrapper around the endpoint; summarize() sends its prompt through it.
	llm_engine_hf = ChatHuggingFace(llm=llm)

	def read_pdf(file_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file_path: Location of the PDF, passed straight to ``PyPDF2.PdfReader``.

	    Returns:
	        One string holding each page's extracted text back to back; no
	        separator is inserted between pages.
	    """
	    pdf_reader = PyPDF2.PdfReader(file_path)
	    # Iterate the pages directly and join once instead of growing a string with
	    # ``+=``. A page that yields no text contributes an empty string rather than
	    # breaking the concatenation.
	    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

	def summarize(file, n_words):
	    """Summarize an uploaded document in roughly ``n_words`` words.

	    Args:
	        file: The upload from the Gradio file input. Either an object exposing
	            its on-disk location as ``.name``, or a plain path (``str`` or
	            ``Path``).
	        n_words: Target summary length; substituted into the prompt.

	    Returns:
	        The ``content`` of the chat model's reply.

	    Raises:
	        gr.Error: If no file was uploaded.
	    """
	    if file is None:
	        # Pressing "Summarize" with nothing uploaded would otherwise fail with an
	        # AttributeError on ``file.name``; report it to the user instead.
	        raise gr.Error("Please upload a file before requesting a summary.")

	    # Read the content of the uploaded file
	    if isinstance(file, (str, Path)):
	        file_path = str(file)
	    else:
	        file_path = file.name

	    # Compare case-insensitively so "REPORT.PDF" is not misread as UTF-8 text.
	    if file_path.lower().endswith('.pdf'):
	        text = read_pdf(file_path)
	    else:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            text = f.read()

	    # Both N_WORDS mentions use single braces: doubled braces are an escape and
	    # would reach the model as the literal text "{N_WORDS}" instead of the number.
	    template = ''' [INST]
	Your task is to summarize a long text into a concise summary of a specific number of words.

	The summary you generate must be EXACTLY {N_WORDS} words long.

	Before writing your final summary, first break down the key points of the text in a <scratchpad>. Identify the most important information that should be included in a summary of the specified length.

	Then, write a summary that captures the core ideas and key details of the text. Start with an introductory sentence and then concisely summarize the main points in a logical order. Make sure to stay within the {N_WORDS} word limit.

	Here is the long text to summarize:
	Text:
	{TEXT}


	[/INST]
	'''
	    prompt = PromptTemplate(
	        template=template,
	        input_variables=['TEXT', "N_WORDS"]
	    )

	    # Generate the summary. The document is sent to the model in a single prompt;
	    # nothing here splits or truncates it.
	    formatted_prompt = prompt.format(TEXT=text, N_WORDS=n_words)
	    output_summary = llm_engine_hf.invoke(formatted_prompt)
	    return output_summary.content

	def download_summary(output_text):
	    """Save summary text to ``summary.txt`` and return that path.

	    Args:
	        output_text: Summary text to persist.

	    Returns:
	        ``Path('summary.txt')`` once the text has been written there as UTF-8,
	        or ``None`` when ``output_text`` is empty or otherwise falsy (nothing
	        is written in that case).
	    """
	    # Guard clause: nothing to save, so leave the filesystem untouched.
	    if not output_text:
	        return None

	    destination = Path('summary.txt')
	    with destination.open('w', encoding='utf-8') as handle:
	        handle.write(output_text)
	    return destination
	def create_download_file(summary_text):
	    """Write the summary via ``download_summary`` and return its path as a string.

	    Returns ``None`` when ``download_summary`` produced no file.
	    """
	    saved_path = download_summary(summary_text)
	    if not saved_path:
	        return None
	    return str(saved_path)

	# Create the Gradio interface
	# NOTE(review): the nesting below was rebuilt from a copy of this file whose
	# indentation had been flattened; confirm that both buttons are meant to sit
	# below the two-column row rather than inside it.
	with gr.Blocks() as demo:
	    gr.Markdown("## Document Summarizer")

	    with gr.Row():
	        # Input column: requested summary length and the document to summarize.
	        with gr.Column():
	            n_words = gr.Slider(minimum=50, maximum=500, step=50, label="Number of words (approximately)")
	            file = gr.File(label="Submit a file")

	        # Output column: the generated summary.
	        with gr.Column():
	            output_text = gr.Textbox(label="Summary", lines=20)

	    submit_button = gr.Button("Summarize")
	    submit_button.click(summarize, inputs=[file, n_words], outputs=output_text)

	    # The handler receives the textbox's current text through ``inputs`` and
	    # returns the path of the written file, which fills the File output.
	    download_button = gr.Button("Download Summary")
	    download_button.click(
	        fn=create_download_file,
	        inputs=[output_text],
	        outputs=gr.File(),
	    )

	# Run the Gradio app only when this file is executed as a script, so importing
	# the module (for example to reuse ``demo``) does not start a server.
	if __name__ == "__main__":
	    demo.launch(share=True)