Spaces:
Sleeping
Sleeping
import warnings | |
warnings.simplefilter(action='ignore', category=FutureWarning) | |
import logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
import fitz | |
import gradio as gr | |
from langchain.prompts import PromptTemplate | |
from pathlib import Path | |
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint | |
from langdetect import detect | |
CONTEXT_WINDOW = 50_000 | |
from transformers import BitsAndBytesConfig | |
# 4-bit quantization settings: NF4 quant type, float16 compute dtype and
# double quantization enabled. Consumed below via the endpoint's model_kwargs.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)
# Hugging Face endpoint client used for every summarization request.
# Alternative model kept for reference: "mistralai/Mistral-7B-Instruct-v0.3".
# NOTE(review): temperature is set while do_sample=False — confirm which of
# the two is actually intended.
# NOTE(review): the quantization config is forwarded through model_kwargs to
# an endpoint client — confirm the serving side honours (and can accept) it.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-Nemo-Instruct-2407",
    task="text-generation",
    max_new_tokens=4096,
    do_sample=False,
    temperature=0.5,
    model_kwargs={"quantization_config": quantization_config},
)
# Chat wrapper, currently disabled:
# llm_engine_hf = ChatHuggingFace(llm=llm)
def read_pdf(file_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The concatenated text of all pages. If no text could be extracted,
        or the file could not be read, a human-readable message describing
        the problem is returned instead; no exception is propagated.
    """
    logger.info("Reading a PDF file")
    try:
        # The context manager guarantees the document handle is released,
        # including when extraction fails half-way through.
        with fitz.open(file_path) as pdf_document:
            text = "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        # Deliberate best-effort boundary: callers expect a message string
        # rather than an exception.
        error_message = f"Error reading PDF file: {e}"
        logger.error(error_message)
        return error_message

    if not text.strip():
        message = "PDF contains no text. It may be due to the PDF being password-protected, collapsed, or full of images."
        logger.info(message)
        return message
    return text
def read_txt(file_path):
    """Return the full contents of a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents, or an error message string if reading fails
        (the exception is logged, not raised).
    """
    logger.info("Reading a TXT file")
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            return handle.read()
    except Exception as exc:
        failure = f"Error reading TXT file: {exc}"
        logger.error(failure)
        return failure
def summarize(file):
    """Summarize an uploaded PDF or text file as a bulleted list.

    The document is read, its language is detected from the first
    CONTEXT_WINDOW characters, and the text is sent to the model in chunks
    of CONTEXT_WINDOW characters. The per-chunk summaries are joined with
    blank lines.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        The combined summary text, or a short explanatory message when no
        file was provided or the document has no text.
    """
    if file is None:
        # Clicking "Summarize" without an upload used to crash on `file.name`.
        return "No file was provided. Please upload a PDF or TXT file."

    # Accept both file-like objects (with .name) and plain path strings.
    file_path = str(getattr(file, "name", file))

    # Case-insensitive so that "REPORT.PDF" is not parsed as UTF-8 text.
    if file_path.lower().endswith(".pdf"):
        text = read_pdf(file_path)
    else:
        text = read_txt(file_path)
    # NOTE(review): read_pdf/read_txt return their error message as the text,
    # so a read failure is passed to the model like document content.
    logger.info("Length of text is %d", len(text))

    if not text.strip():
        message = "The document contains no text to summarize."
        logger.info(message)
        return message

    # Imported locally: only needed here, and keeps this block self-contained.
    from langdetect import LangDetectException

    try:
        lang = detect(text[:CONTEXT_WINDOW])
    except LangDetectException:
        # Raised when the sample has no usable features (e.g. digits only).
        logger.warning("Language detection failed; defaulting to 'en'")
        lang = "en"

    # Same prompt text as before, spelled out line by line.
    template = (
        "\n"
        "Please carefully read the following document:\n"
        "<document>\n"
        "{TEXT}\n"
        "</document>\n"
        "After reading through the document, pinpoint the key points and main ideas covered in the text.\n"
        "Organize these key points into a concise bulleted list that summarizes the essential information from the document.\n"
        "The summary should be in {LANG} language.\n"
    )
    prompt_summarize = PromptTemplate(
        template=template,
        input_variables=["TEXT", "LANG"],
    )

    # One model call per CONTEXT_WINDOW-sized slice of the document.
    summaries = [
        llm.invoke(
            prompt_summarize.format(
                TEXT=text[start:start + CONTEXT_WINDOW],
                LANG=lang,
            )
        )
        for start in range(0, len(text), CONTEXT_WINDOW)
    ]
    logger.info("Chunked into %d.", len(summaries))
    return "\n\n".join(summaries)
def download_summary(output_text):
    """Write the summary to ``summary.txt`` in the working directory.

    Args:
        output_text: The summary text to save.

    Returns:
        The ``Path`` of the written file, or ``None`` when ``output_text``
        is empty or otherwise falsy (nothing is written in that case).
    """
    if not output_text:
        return None

    destination = Path('summary.txt')
    with destination.open('w', encoding='utf-8') as handle:
        handle.write(output_text)
    return destination
def create_download_file(summary_text):
    """Save the summary via ``download_summary`` and return its path as a string.

    Args:
        summary_text: The summary text to save.

    Returns:
        The saved file's path as ``str``, or ``None`` when nothing was saved.
    """
    saved_path = download_summary(summary_text)
    if saved_path:
        return str(saved_path)
    return None
# Create the Gradio interface.
# NOTE(review): the original indentation was lost, so the nesting of the
# layout below is reconstructed from convention — confirm against the
# intended UI.
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Submit a file")
        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    # Summarize the uploaded file into the textbox.
    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file], outputs=output_text)

    # The unused `generate_file` helper was removed: it was never bound to an
    # event and passed the Textbox component itself (not its text) to
    # download_summary.

    # Save the current textbox contents and offer them as a downloadable file.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File(),
    )

# Run the Gradio app
demo.launch(share=True)