Spaces:
Sleeping
Sleeping
File size: 3,311 Bytes
f1691d8 026783f 0dd7ae7 162dd8b f1691d8 b975282 bf3ac4c dc3c24e 0dd7ae7 f1691d8 d100ccb a1758a8 8639815 f6a07f3 f1691d8 162dd8b 026783f 28f0884 a84c0e0 28f0884 a84c0e0 162dd8b b975282 162dd8b 026783f 241247a 026783f a84c0e0 dc3c24e 026783f dc3c24e b975282 1dcae0c b975282 162dd8b 8639815 162dd8b b9c9dac 162dd8b f1691d8 162dd8b f1691d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_core.output_parsers import JsonOutputParser
from langdetect import detect
# Maximum number of characters of document text that summarize() keeps;
# longer documents are cut to this many leading characters.
CONTEXT_WINDOW = 31_750
# Text-generation endpoint for the Mistral 7B Instruct v0.3 model; this is the
# object summarize() invokes with the formatted prompt.
llm = HuggingFaceEndpoint(
repo_id="mistralai/Mistral-7B-Instruct-v0.3",
task="text-generation",
max_new_tokens=4096,
# NOTE(review): temperature is set while do_sample is False; presumably the
# temperature has no effect when sampling is disabled -- confirm the intent.
temperature=0.5,
do_sample=False,
)
# Chat model wrapper built on the same endpoint. It is not referenced by any
# other code visible in this file.
llm_engine_hf = ChatHuggingFace(llm=llm)
def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or ``None`` when
        the file cannot be opened or parsed. Failures are printed rather
        than raised, so callers must be prepared for a ``None`` result.
    """
    print("It is a PDF file")
    try:
        pdf_reader = PyPDF2.PdfReader(file_path)
        # Build the result with a single join instead of repeated ``+=``.
        # ``or ""`` keeps one page without extractable text from turning the
        # whole document into a failure.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        # Deliberately best-effort: report the problem and signal it with None.
        print("Error reading file, ", e)
        return None
def read_txt(file_path):
    """Return the full content of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded content of the file as one string.
    """
    print("It is not a PDF file")
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()
def summarize(file, n_words=None):
    """Summarize an uploaded document as a bulleted list of key points.

    Args:
        file: The uploaded file, either an object exposing its path through a
            ``name`` attribute or a plain path string.
        n_words: Accepted for backward compatibility and currently unused.
            It is optional so the function can be called with the file alone.

    Returns:
        The model's summary, or a short explanatory message when no file was
        supplied or no text could be read from it.
    """
    if file is None:
        return "Please upload a file before requesting a summary."

    # Accept both file wrappers (path in ``.name``) and bare path strings.
    file_path = str(getattr(file, "name", file))

    # Compare the extension case-insensitively so "REPORT.PDF" is not
    # mistakenly read as plain text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        text = read_txt(file_path)

    # read_pdf reports failure with None, and a blank document leaves
    # nothing to summarize; stop here instead of crashing further down.
    if not text or not text.strip():
        return "No readable text could be extracted from the file."

    print("Length of text is ", len(text))
    if len(text) > CONTEXT_WINDOW:
        print(f"Slicing the first {CONTEXT_WINDOW} characters")
        text = text[:CONTEXT_WINDOW]

    # The text is already capped at CONTEXT_WINDOW, so no second slice is needed.
    lang = detect(text)

    template_summarize = '''
Please carefully read the following document:
<document>
{TEXT}
</document>
After reading through the document, pinpoint the key points and main ideas covered in the text.
Organize these key points into a concise bulleted list that summarizes the essential information from the document.
The summary should be in {LANG} language.
'''
    prompt_summarize = PromptTemplate(
        template=template_summarize,
        input_variables=["TEXT", "LANG"],
    )
    formatted_prompt = prompt_summarize.format(TEXT=text, LANG=lang)
    summary = llm.invoke(formatted_prompt)
    return summary
def download_summary(output_text):
    """Save the summary to ``summary.txt`` in the current working directory.

    Args:
        output_text: Summary text to write.

    Returns:
        A ``Path`` pointing at the written file, or ``None`` when
        ``output_text`` is empty or otherwise falsy (nothing is written then).
    """
    if not output_text:
        return None

    target = Path('summary.txt')
    with target.open('w', encoding='utf-8') as out:
        out.write(output_text)
    return target
def create_download_file(summary_text):
    """Write the summary to disk and return the file location as a string.

    Args:
        summary_text: Summary text to save.

    Returns:
        The path of the saved file as ``str``, or ``None`` when
        ``download_summary`` did not produce a file.
    """
    saved_path = download_summary(summary_text)
    if not saved_path:
        return None
    return str(saved_path)
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    # Upload widget and summary box, each in its own column of one row.
    with gr.Row():
        with gr.Column():
            file = gr.File(label="Submit a file")
        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    # Summarize the uploaded file and show the result in the text box.
    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file], outputs=output_text)

    # Save the text currently in the summary box and offer it as a file.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File(),
    )

# Run the Gradio app
demo.launch(share=True)