Mistral-7B-Summarizer-v2

Sleeping

App Files Files Community

Mistral-7B-Summarizer-v2 / app.py

Chan-Y

Update app.py

e50a70a verified 7 months ago

raw

history blame contribute delete

4.31 kB

	import warnings
	warnings.simplefilter(action='ignore', category=FutureWarning)
	import logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	import fitz
	import gradio as gr
	from langchain.prompts import PromptTemplate
	from pathlib import Path
	from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
	from langdetect import detect
	# Chunk size, counted in characters (not tokens): summarize() slices the
	# document text into pieces of this length and also limits the sample passed
	# to language detection to this many characters.
	CONTEXT_WINDOW = 50_000
	from transformers import BitsAndBytesConfig

	# 4-bit NF4 quantization settings (float16 compute dtype, double quantization).
	# NOTE(review): this config is only forwarded through model_kwargs to
	# HuggingFaceEndpoint below — confirm the endpoint actually honours it.
	quantization_config = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_quant_type="nf4",
	bnb_4bit_compute_dtype="float16",
	bnb_4bit_use_double_quant=True
	)
	# Module-level text-generation client shared by summarize().
	# NOTE(review): temperature=0.5 is set together with do_sample=False —
	# confirm which of the two the backend gives precedence to.
	llm = HuggingFaceEndpoint(
	repo_id="mistralai/Mistral-Nemo-Instruct-2407", #"mistralai/Mistral-7B-Instruct-v0.3",
	task="text-generation",
	model_kwargs={"quantization_config": quantization_config},
	max_new_tokens=4096,
	temperature=0.5,
	do_sample=False,
	)
	# Chat wrapper around the endpoint, currently disabled.
	#llm_engine_hf = ChatHuggingFace(llm=llm)

	def read_pdf(file_path):
	    """Extract the text of every page of a PDF file.

	    Args:
	        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

	    Returns:
	        The concatenated text of all pages. If the document yields no text,
	        or if opening/reading it fails, a human-readable message string is
	        returned instead; no exception propagates to the caller.
	    """
	    logger.info("Reading a PDF file")
	    try:
	        # The context manager closes the document even when a page fails to
	        # parse; previously the handle was never released.
	        with fitz.open(file_path) as pdf_document:
	            text = "".join(page.get_text() for page in pdf_document)
	    except Exception as e:
	        # Deliberately broad: any failure is reported back as text.
	        error_message = f"Error reading PDF file: {e}"
	        logger.error(error_message)
	        return error_message

	    if not text.strip():
	        message = "PDF contains no text. It may be due to the PDF being password-protected, collapsed, or full of images."
	        logger.info(message)
	        return message

	    return text

	def read_txt(file_path):
	    """Return the full contents of a UTF-8 text file.

	    Args:
	        file_path: Path of the text file to read.

	    Returns:
	        The file contents, or an ``"Error reading TXT file: ..."`` message
	        string when the file cannot be opened or decoded.
	    """
	    logger.info("Reading a TXT file")
	    try:
	        with open(file_path, mode="r", encoding="utf-8") as handle:
	            contents = handle.read()
	    except Exception as exc:
	        failure = f"Error reading TXT file: {exc}"
	        logger.error(failure)
	        return failure
	    return contents

	def summarize(file):
	    """Summarize an uploaded PDF or text file as bulleted key points.

	    The document text is split into pieces of ``CONTEXT_WINDOW`` characters,
	    each piece is summarized by the module-level ``llm``, and the partial
	    summaries are joined with blank lines.

	    Args:
	        file: The uploaded file: either an object exposing its path as
	            ``.name`` or a plain path string. ``None`` when nothing has
	            been uploaded.

	    Returns:
	        The combined summary text, or a short explanatory message when no
	        file was provided or the file contains no text.
	    """
	    # Local import keeps this block self-contained; langdetect is already a
	    # dependency of this file.
	    from langdetect.lang_detect_exception import LangDetectException

	    if file is None:
	        return "Please upload a file before requesting a summary."

	    # Read the content of the uploaded file
	    file_path = str(getattr(file, "name", file))
	    # Case-insensitive so that e.g. "REPORT.PDF" is not read as plain text.
	    if file_path.lower().endswith('.pdf'):
	        text = read_pdf(file_path)
	    else:
	        text = read_txt(file_path)

	    logger.info("Length of text is %d", len(text))

	    if not text.strip():
	        return "The uploaded file contains no text to summarize."

	    try:
	        lang = detect(text[:CONTEXT_WINDOW])
	    except LangDetectException:
	        # Raised when the sample has no usable features (e.g. digits only).
	        logger.warning("Language detection failed; defaulting to 'en'")
	        lang = "en"

	    template_summarize = '''
	Please carefully read the following document:
	<document>
	{TEXT}
	</document>
	After reading through the document, pinpoint the key points and main ideas covered in the text.
	Organize these key points into a concise bulleted list that summarizes the essential information from the document.
	The summary should be in {LANG} language.
	'''

	    prompt_summarize = PromptTemplate(
	        template=template_summarize,
	        input_variables=["TEXT", "LANG"],
	    )

	    summaries = []
	    for start in range(0, len(text), CONTEXT_WINDOW):
	        chunk = text[start:start + CONTEXT_WINDOW]
	        formatted_prompt = prompt_summarize.format(TEXT=chunk, LANG=lang)
	        summaries.append(llm.invoke(formatted_prompt))

	    logger.info("Chunked into %d.", len(summaries))

	    return "\n\n".join(summaries)

	def download_summary(output_text):
	    """Write the summary text to ``summary.txt`` and return that path.

	    Args:
	        output_text: The summary text to save.

	    Returns:
	        A ``Path`` pointing at ``summary.txt``, or ``None`` when
	        ``output_text`` is empty (in which case nothing is written).
	    """
	    if not output_text:
	        return None

	    target = Path('summary.txt')
	    with target.open('w', encoding='utf-8') as handle:
	        handle.write(output_text)
	    return target
	def create_download_file(summary_text):
	    """Save the summary via ``download_summary`` and return its path as a string.

	    Args:
	        summary_text: The summary text to save.

	    Returns:
	        The saved file's path as ``str``, or ``None`` when nothing was saved.
	    """
	    saved = download_summary(summary_text)
	    if not saved:
	        return None
	    return str(saved)

	# Create the Gradio interface: a file upload, a summary textbox, and two
	# buttons (one to summarize, one to download the summary).
	# NOTE(review): the nesting of the layout context managers below is not
	# recoverable from this copy of the file — confirm against the deployed app.
	with gr.Blocks() as demo:
	gr.Markdown("## Document Summarizer")

	with gr.Row():
	with gr.Column():
	file = gr.File(label="Submit a file")

	with gr.Column():
	output_text = gr.Textbox(label="Summary", lines=20)

	# Clicking "Summarize" passes the uploaded file to summarize() and puts the
	# returned text into the summary textbox.
	submit_button = gr.Button("Summarize")
	submit_button.click(summarize, inputs=[file], outputs=output_text)

	# NOTE(review): generate_file is not referenced by any handler in this
	# section, and it hands the Textbox component object itself (not its text)
	# to download_summary — looks like leftover code; confirm before removing.
	def generate_file():
	summary_text = output_text
	file_path = download_summary(summary_text)
	return file_path

	# Clicking "Download Summary" sends the textbox content to
	# create_download_file and exposes the returned path through a File component.
	download_button = gr.Button("Download Summary")
	download_button.click(
	fn=create_download_file,
	inputs=[output_text],
	outputs=gr.File()
	)
	# Run the Gradio app (share=True requests a public share link)
	demo.launch(share=True)