File size: 4,307 Bytes
f1691d8
 
a43e6d5
 
 
f1691d8
c97532f
0dd7ae7
162dd8b
 
f1691d8
bf3ac4c
367b557
e50a70a
0dd7ae7
e50a70a
 
 
 
 
 
f1691d8
e292744
a1758a8
e50a70a
8639815
f6a07f3
f1691d8
 
9e89ef8
162dd8b
026783f
a43e6d5
28f0884
c97532f
28f0884
c97532f
 
 
 
a43e6d5
 
 
 
 
28f0884
a43e6d5
28f0884
a43e6d5
 
 
28f0884
a84c0e0
a43e6d5
 
 
 
 
 
 
 
 
a84c0e0
9e89ef8
b975282
162dd8b
 
026783f
241247a
026783f
a84c0e0
dc3c24e
a43e6d5
 
dc3c24e
b975282
 
 
 
 
 
 
 
 
 
 
 
 
 
1dcae0c
65033bf
 
 
 
 
 
a43e6d5
 
65033bf
 
 
162dd8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8639815
162dd8b
 
b9c9dac
162dd8b
f1691d8
 
 
 
 
162dd8b
 
 
 
 
 
 
f1691d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Module setup: silence noisy FutureWarnings, configure INFO-level logging,
# and pull in PDF / UI / LLM dependencies.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import fitz  # PyMuPDF — used for PDF text extraction in read_pdf()
import gradio as gr
from langchain.prompts import PromptTemplate
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langdetect import detect
# Max characters per LLM request; also the sample size for language detection.
CONTEXT_WINDOW = 50_000
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization settings (bitsandbytes).
# NOTE(review): HuggingFaceEndpoint talks to a hosted inference endpoint, so
# a client-side quantization_config in model_kwargs is presumably ignored by
# the server — confirm this is intentional (it would only apply to a locally
# loaded model).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True
)
# Remote text-generation model used by summarize().
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-Nemo-Instruct-2407", #"mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    model_kwargs={"quantization_config": quantization_config},
    max_new_tokens=4096,
    # NOTE(review): temperature=0.5 with do_sample=False (greedy decoding) —
    # verify which setting the endpoint actually honors.
    temperature=0.5,
    do_sample=False,
)
#llm_engine_hf = ChatHuggingFace(llm=llm)

def read_pdf(file_path):
    """Extract all text from a PDF file.

    Returns the concatenated text of every page. On failure, or when the
    PDF has no extractable text, a human-readable message string is
    returned instead of raising (callers display whichever string comes
    back).
    """
    logger.info("Reading a PDF file")
    try:
        # Context manager guarantees the document handle is closed even if
        # extraction raises — the original leaked the open document.
        with fitz.open(file_path) as pdf_document:
            # join() over the page iterator replaces the quadratic
            # `text += page.get_text()` index loop.
            text = "".join(page.get_text() for page in pdf_document)

        if not text.strip():
            message = "PDF contains no text. It may be due to the PDF being password-protected, collapsed, or full of images."
            logger.info(message)
            return message

        return text

    except Exception as e:
        error_message = f"Error reading PDF file: {e}"
        logger.error(error_message)
        return error_message
        
def read_txt(file_path):
    """Read a UTF-8 text file and return its contents.

    Any failure is logged and reported back as a descriptive message
    string rather than an exception, matching read_pdf's contract.
    """
    logger.info("Reading a TXT file")
    try:
        with open(file_path, encoding="utf-8") as handle:
            return handle.read()
    except Exception as exc:
        error_message = f"Error reading TXT file: {exc}"
        logger.error(error_message)
        return error_message

def summarize(file):
    """Summarize an uploaded PDF or TXT file into a bulleted list.

    Args:
        file: Gradio file object; only its ``.name`` (a filesystem path)
            is used.

    Returns:
        The per-chunk summaries joined by blank lines, as one string.
    """
    # Read the content of the uploaded file.
    file_path = file.name
    # lower() makes the suffix check case-insensitive so '.PDF' uploads
    # are routed to the PDF reader instead of being read as UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        text = read_txt(file_path)

    # Lazy %-style args avoid formatting work when INFO is disabled.
    logger.info("Length of text is %d", len(text))

    # NOTE(review): if read_pdf/read_txt failed, `text` is an error message
    # and that message is what gets summarized — consider short-circuiting
    # and returning it directly. Language is detected from the first chunk.
    lang = detect(text[:CONTEXT_WINDOW])
    template_translate = '''
Please carefully read the following document:
<document>
{TEXT}
</document>
After reading through the document, pinpoint the key points and main ideas covered in the text. 
Organize these key points into a concise bulleted list that summarizes the essential information from the document. 
The summary should be in {LANG} language.
'''

    prompt_summarize = PromptTemplate(
        template=template_translate,
        input_variables=["TEXT", "LANG"]
    )

    # Summarize each CONTEXT_WINDOW-sized slice independently; the model's
    # context cannot hold the whole document at once.
    summaries = []
    for i in range(0, len(text), CONTEXT_WINDOW):
        chunk = text[i:i + CONTEXT_WINDOW]
        formatted_prompt = prompt_summarize.format(TEXT=chunk, LANG=lang)
        summaries.append(llm.invoke(formatted_prompt))

    logger.info("Chunked into %d.", len(summaries))

    return "\n\n".join(summaries)

def download_summary(output_text):
    """Persist the summary text to 'summary.txt' and return its Path.

    Returns None when there is nothing to write (empty or falsy text).
    """
    if not output_text:
        return None
    summary_path = Path('summary.txt')
    summary_path.write_text(output_text, encoding='utf-8')
    return summary_path
def create_download_file(summary_text):
    """Gradio glue: return the written summary's path as a string, or None."""
    saved = download_summary(summary_text)
    if saved is None:
        return None
    return str(saved)

# Create the Gradio interface: file input on the left, summary textbox on
# the right, with buttons to run the summarizer and download the result.
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Submit a file")

        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file], outputs=output_text)

    # The dead `generate_file` helper was removed: it captured the Textbox
    # component object itself (not its string value) and was never wired to
    # any event — create_download_file below is the working path.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File()
    )

# Run the Gradio app; share=True also exposes a public Gradio link.
demo.launch(share=True)