Mistral-7B-Summarizer-v2

Sleeping

File size: 3,384 Bytes

f1691d8
 
 
026783f
0dd7ae7
162dd8b
 
f1691d8
b975282
0dd7ae7
f1691d8
d100ccb
a1758a8
8639815
f6a07f3
f1691d8
 
 
162dd8b
026783f
 
 
 
 
 
 
162dd8b
b975282
162dd8b
 
026783f
241247a
026783f
 
241247a
026783f
b975282
d3aa346
ec98a9a
31eecd9
ec98a9a
31eecd9
ec98a9a
9be919d
d3aa346
241247a
b975282
 
b9c9dac
f1691d8
241247a
b975282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dcae0c
b975282
162dd8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8639815
162dd8b
 
b9c9dac
162dd8b
f1691d8
 
 
 
 
162dd8b
 
 
 
 
 
 
f1691d8

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_core.output_parsers import JsonOutputParser

# Settings for the hosted Mistral instruct endpoint, kept in one mapping so
# the generation parameters can be read at a glance.
# NOTE(review): `temperature` is set while `do_sample` is False; with sampling
# disabled the temperature presumably has no effect - confirm which is intended.
_ENDPOINT_SETTINGS = {
    "repo_id": "mistralai/Mistral-7B-Instruct-v0.3",
    "task": "text-generation",
    "max_new_tokens": 4096,
    "temperature": 0.5,
    "do_sample": False,
}

# Raw text-generation endpoint, plus a chat-style wrapper around the same endpoint.
llm = HuggingFaceEndpoint(**_ENDPOINT_SETTINGS)
llm_engine_hf = ChatHuggingFace(llm=llm)

def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF; passed straight to ``PyPDF2.PdfReader``.

    Returns:
        A single string containing the extracted text of all pages, with no
        separator inserted between pages. Pages that yield no text contribute
        an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # `or ""` keeps the join safe if extraction yields nothing for a page
    # (for example, an image-only page).
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    
# Prompt asking the model to report the document's dominant language as JSON.
_DETECT_TEMPLATE = '''
Please carefully read the following document:

<document>
{TEXT}
</document>

identify the MOST used language in the document, return detected language in json format with key "language" and value is the detected language
'''

# Prompt asking the model for a bulleted summary written in a given language.
_SUMMARIZE_TEMPLATE = '''
Please carefully read the following document:
<document>
{TEXT}
</document>
After reading through the document, pinpoint the key points and main ideas covered in the text. 
Organize these key points into a concise bulleted list that summarizes the essential information from the document. 
The summary should be in {LANG} language.
'''

# Substituted for {LANG} when the language cannot be determined, so the final
# sentence of the summary prompt still reads naturally.
_FALLBACK_LANGUAGE = "the document's original"


def _detect_language(text):
    """Ask the model which language dominates *text*.

    Returns the detected language name, or ``_FALLBACK_LANGUAGE`` when the
    model's reply cannot be parsed as JSON or lacks a usable "language" key.
    """
    from langchain_core.exceptions import OutputParserException

    prompt_detect = PromptTemplate(
        template=_DETECT_TEMPLATE,
        input_variables=['TEXT'],
    )
    language_detect = prompt_detect | llm | JsonOutputParser()
    try:
        # The chain formats the prompt itself, so it receives the raw template
        # variables rather than an already-formatted prompt string.
        detected = language_detect.invoke({"TEXT": text})
    except OutputParserException:
        return _FALLBACK_LANGUAGE

    if isinstance(detected, dict) and detected.get("language"):
        return str(detected["language"])
    return _FALLBACK_LANGUAGE


def summarize(file, n_words=None):
    """Summarize an uploaded document as a bulleted list in its own language.

    The document is read (PDF via ``read_pdf``, anything else as UTF-8 text),
    its dominant language is detected with one model call, and a second model
    call produces the summary in that language.

    Args:
        file: The uploaded file: either an object exposing its path as
            ``.name`` or a plain path string.
        n_words: Currently unused. Optional so the function can be wired to a
            single-input event handler; kept for interface compatibility.

    Returns:
        The model's summary text.

    Raises:
        gr.Error: If no file was provided.
    """
    if file is None:
        raise gr.Error("Please upload a file before requesting a summary.")

    # Accept both upload objects (path in `.name`) and plain path strings.
    file_path = str(getattr(file, "name", file))
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    lang = _detect_language(text)

    prompt_summarize = PromptTemplate(
        template=_SUMMARIZE_TEMPLATE,
        input_variables=["TEXT", "LANG"],
    )
    formatted_prompt = prompt_summarize.format(TEXT=text, LANG=lang)
    return llm.invoke(formatted_prompt)

def download_summary(output_text):
    """Save the summary text to ``summary.txt`` and return that path.

    Args:
        output_text: The summary to persist.

    Returns:
        ``Path('summary.txt')`` after writing the text as UTF-8, or ``None``
        (without writing anything) when *output_text* is empty or ``None``.
    """
    if not output_text:
        return None

    target = Path('summary.txt')
    with target.open('w', encoding='utf-8') as handle:
        handle.write(output_text)
    return target
def create_download_file(summary_text):
    """Persist the summary via ``download_summary`` and return its path as a string.

    Returns ``None`` when ``download_summary`` produced no file.
    """
    saved = download_summary(summary_text)
    if not saved:
        return None
    return str(saved)

# Create the Gradio interface: file upload on the left, summary text on the
# right, with one button to run the summarizer and one to export the result.
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Submit a file")

        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file], outputs=output_text)

    download_button = gr.Button("Download Summary")
    # Created after the button so the layout order is unchanged; the click
    # handler receives the textbox's current value, writes it to disk and
    # hands the resulting path to this file component.
    summary_file = gr.File()
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=summary_file,
    )

# Run the Gradio app
demo.launch(share=True)