File size: 3,534 Bytes
f1691d8
 
 
026783f
0dd7ae7
162dd8b
 
 
f1691d8
162dd8b
 
f1691d8
0dd7ae7
f1691d8
 
 
8639815
f1691d8
 
 
162dd8b
026783f
 
 
 
 
 
 
162dd8b
 
 
026783f
 
 
 
 
 
f1691d8
 
 
 
 
 
b9c9dac
f1691d8
 
b9c9dac
f1691d8
 
 
 
 
 
 
 
 
 
 
b9c9dac
f1691d8
 
 
 
162dd8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8639815
162dd8b
 
b9c9dac
162dd8b
f1691d8
 
 
 
 
162dd8b
 
 
 
 
 
 
f1691d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.documents import Document
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Settings for the hosted text-generation endpoint used by the summarizer.
_endpoint_settings = {
    "repo_id": "mistralai/Mistral-7B-Instruct-v0.3",
    "task": "text-generation",
    "max_new_tokens": 4096,
    "do_sample": False,
}
llm = HuggingFaceEndpoint(**_endpoint_settings)

# Chat wrapper around the endpoint; summarize() sends its prompt through it.
llm_engine_hf = ChatHuggingFace(llm=llm)

def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF; handed directly to
            ``PyPDF2.PdfReader``.

    Returns:
        A single string holding the extracted text of all pages. No
        separator is inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # A page that yields no text contributes an empty string instead of
    # breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    
def summarize(file, n_words=200):
    """Summarize an uploaded document in roughly ``n_words`` words.

    PDF files are read through ``read_pdf``; anything else is read as UTF-8
    text. The whole document is placed into a single prompt and sent to the
    chat model in one request.

    Args:
        file: The uploaded file. Either an object exposing its path as
            ``.name`` or a plain path string is accepted.
        n_words: Target length of the summary in words. Defaults to 200 so
            the function also works when only the file is supplied; a falsy
            value (``None``, ``0``) falls back to the same default.

    Returns:
        The text content of the model's reply.

    Raises:
        gr.Error: If no file was provided.
    """
    if file is None:
        raise gr.Error("Please upload a file before requesting a summary.")
    if not n_words:
        n_words = 200

    # Read the content of the uploaded file
    file_path = str(getattr(file, 'name', file))
    # Compare case-insensitively so "REPORT.PDF" is still treated as a PDF.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    # Generate the summary
    # {N_WORDS} uses single braces so the template engine substitutes the
    # requested length; doubled braces would be emitted literally.
    template = '''
    Your task is to summarize a long text into a concise summary of a specific number of words. 

    The summary you generate must be very detailed. 

    Before writing your final summary, first break down the key points of the text in a <scratchpad>. Identify the most important information that should be included in a summary of the specified length.

    Then, write a summary that captures the core ideas and key details of the text. Start with an introductory sentence and then concisely summarize the main points in a logical order. Make sure to stay within the {N_WORDS} word limit.

    Here is the long text to summarize:
    Text: 
    {TEXT}
    '''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'N_WORDS']
    )
    # int() renders a numeric UI value such as 150.0 as "150" in the prompt.
    formatted_prompt = prompt.format(TEXT=text, N_WORDS=int(n_words))
    output_summary = llm_engine_hf.invoke(formatted_prompt)
    return output_summary.content

def download_summary(output_text):
    """Save the summary to ``summary.txt`` and return the path written.

    Args:
        output_text: Summary text to persist.

    Returns:
        A ``Path`` pointing at ``summary.txt`` (relative to the working
        directory), or ``None`` when ``output_text`` is empty or ``None``,
        in which case nothing is written.
    """
    # Nothing to save: leave the disk untouched.
    if not output_text:
        return None

    destination = Path('summary.txt')
    with destination.open('w', encoding='utf-8') as handle:
        handle.write(output_text)
    return destination
def create_download_file(summary_text):
    """Write the summary to disk and return the file's path as a string.

    Delegates the writing to ``download_summary`` and converts its result.

    Args:
        summary_text: Summary text to save.

    Returns:
        The saved file's path as ``str``, or ``None`` when
        ``download_summary`` produced no file.
    """
    saved_path = download_summary(summary_text)
    if saved_path:
        return str(saved_path)
    return None

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Submit a file")
            # summarize() takes the target length as its second argument, so
            # the UI collects it here; precision=0 delivers it as an integer.
            n_words = gr.Number(label="Summary length (words)", value=200, precision=0)

        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    # Both arguments of summarize() are supplied: the upload and the length.
    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file, n_words], outputs=output_text)

    # The current Textbox value is passed to create_download_file, which
    # saves it and returns the path shown in the File output.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File()
    )
# Run the Gradio app
demo.launch(share=True)