File size: 3,697 Bytes
f1691d8
 
 
026783f
0dd7ae7
162dd8b
 
 
f1691d8
162dd8b
 
f1691d8
0dd7ae7
f1691d8
 
 
 
 
 
 
162dd8b
026783f
 
 
 
 
 
 
162dd8b
 
 
026783f
 
 
 
 
 
f1691d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162dd8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1691d8
162dd8b
 
 
f1691d8
162dd8b
 
 
 
f1691d8
 
 
 
 
162dd8b
 
 
 
 
 
 
f1691d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.documents import Document
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Text-generation endpoint for the Mistral-7B-Instruct model; the chat wrapper
# built from it below is what summarize() invokes.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    # NOTE(review): 1025 may be a typo for 1024 — confirm the intended cap.
    max_new_tokens=1025,
    do_sample=False,  # sampling disabled
)
# Chat-model wrapper around the endpoint; summarize() reads the reply text
# from the `.content` attribute of what `invoke` returns.
llm_engine_hf = ChatHuggingFace(llm=llm)

def read_pdf(file_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file_path: Location of the PDF; passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages in document order, separated by newlines.
        A page that yields no text contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; `or ""` keeps a page without extractable text
    # from breaking the join.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    
def summarize(file, n_words):
    """Summarize an uploaded document in about ``n_words`` words.

    Args:
        file: The upload handed over by the ``gr.File`` input. Either an
            object exposing the on-disk path as ``.name`` or a plain path
            string.
        n_words: Target summary length in words, as chosen on the slider.

    Returns:
        The summary text produced by the chat model.

    Raises:
        gr.Error: If no file was uploaded, or no text could be read from it.
    """
    if file is None:
        raise gr.Error("Please upload a file before requesting a summary.")

    # Accept both upload objects (path in ``.name``) and bare path strings.
    file_path = str(getattr(file, 'name', file))

    # Compare the extension case-insensitively so ".PDF" is not read as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    if not text.strip():
        raise gr.Error("No readable text was found in the uploaded file.")

    # NOTE: the whole document is sent in one prompt, so a very long file may
    # exceed what the model accepts. The original code split the text into
    # chunks but never used them; that dead work has been dropped.
    #
    # Both placeholders use single braces: doubled braces are an escape and
    # would reach the model as the literal text "{N_WORDS}".
    template = ''' [INST]
    Your task is to summarize a long text into a concise summary of a specific number of words. 

    The summary you generate must be EXACTLY {N_WORDS} words long. 

    Before writing your final summary, first break down the key points of the text in a <scratchpad>. Identify the most important information that should be included in a summary of the specified length.

    Then, write a summary that captures the core ideas and key details of the text. Start with an introductory sentence and then concisely summarize the main points in a logical order. Make sure to stay within the {N_WORDS} word limit.

    Here is the long text to summarize:
    Text: 
    {TEXT}


    [/INST]
    '''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'N_WORDS'],
    )
    formatted_prompt = prompt.format(TEXT=text, N_WORDS=n_words)
    output_summary = llm_engine_hf.invoke(formatted_prompt)
    return output_summary.content

def download_summary(output_text):
    """Write the summary to ``summary.txt`` and return that path.

    Args:
        output_text: Summary text to save.

    Returns:
        A ``Path`` to the written file, or ``None`` when ``output_text`` is
        empty or otherwise falsy (nothing is written in that case).
    """
    # Nothing to save: skip touching the file system.
    if not output_text:
        return None

    target = Path('summary.txt')
    target.write_text(output_text, encoding='utf-8')
    return target
def create_download_file(summary_text):
    """Save the summary via ``download_summary`` and return its path as a string.

    Args:
        summary_text: Summary text to save.

    Returns:
        The saved file's path as ``str``, or ``None`` when
        ``download_summary`` returned nothing.
    """
    saved_path = download_summary(summary_text)
    if not saved_path:
        return None
    return str(saved_path)

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            # Inputs: target summary length and the document to summarize.
            n_words = gr.Slider(minimum=50, maximum=500, step=50, label="Number of words (approximately)")
            file = gr.File(label="Submit a file")

        with gr.Column():
            # Output: the generated summary; also the input of the download step.
            output_text = gr.Textbox(label="Summary", lines=20)

    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file, n_words], outputs=output_text)

    # The textbox is passed as an input component, so the callback receives
    # the summary text currently shown in it. (A former nested helper that
    # handed the component object itself to download_summary was never wired
    # to any event and has been removed.)
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File()
    )
# Run the Gradio app
demo.launch(share=True)