# Spaces: Sleeping
import torch | |
import gradio as gr | |
from transformers import T5ForConditionalGeneration, T5Tokenizer | |
import fitz # PyMuPDF | |
# Checkpoint identifier handed to both ``from_pretrained`` calls below.
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'

# The model and its tokenizer are created once, at module import, and are
# then read as module-level names by the summarization code further down.
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    The document is opened with ``fitz.open`` inside a ``with`` block, so it
    is closed again as soon as the text has been collected.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, joined in page order
        with no separator inserted between pages.
    """
    with fitz.open(pdf_path) as doc:
        # Build the result with a single join rather than growing a string
        # with ``+=`` once per page.
        return "".join(page.get_text() for page in doc)
def summarize_text(text, max_length, show_length):
    """Summarize *text* with the module-level T5 model and tokenizer.

    The input is prefixed with ``"summarize: "`` and truncated to 512 tokens
    before generation, so only the beginning of a long text is summarized.
    The decoded summary is then adjusted towards *max_length* whitespace-
    separated words: a longer summary is cut to its first *max_length* words;
    a shorter one is extended with words taken from a second generation pass
    that is run on the summary itself.

    Args:
        text: The text to summarize.
        max_length: Target summary length in words.  Also used as a token
            budget for generation (``max_length + 20`` for the first pass,
            exactly ``max_length`` for the second).  Coerced to ``int``.
        show_length: When true, a ``(Summary length: N words)`` footer is
            appended to the returned string.

    Returns:
        The summary string, or a short explanatory message when *text* is
        empty or contains only whitespace.
    """
    # The value is used as a slice bound below, which requires an integer.
    max_length = int(max_length)

    # Nothing to summarize: return early instead of running the model on the
    # bare "summarize: " prefix.
    if not text or not text.strip():
        return "No text could be extracted to summarize."

    # Generation options shared by both passes.
    beam_options = {
        "num_beams": 5,
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
    }

    # A single sequence is encoded, so it is truncated but not padded.
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs,
        max_length=max_length + 20,
        min_length=10,
        **beam_options,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summary_words = summary.split()

    if len(summary_words) > max_length:
        # Too long: keep only the first max_length words.
        summary = ' '.join(summary_words[:max_length])
    elif len(summary_words) < max_length:
        # Too short: generate again from the summary, forcing exactly
        # max_length tokens, and borrow the words beyond the current length.
        additional_tokens = model.generate(
            tokenizer.encode(" ".join(summary_words), return_tensors='pt'),
            max_length=max_length,
            min_length=max_length,
            **beam_options,
        )
        additional_summary = tokenizer.decode(
            additional_tokens[0], skip_special_tokens=True
        )
        extra_words = additional_summary.split()[len(summary_words):max_length]
        # Append only when there is something to add, so the summary never
        # ends in a stray space.
        if extra_words:
            summary += ' ' + ' '.join(extra_words)

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"
    return summary
def handle_pdf(pdf, max_length, show_length):
    """Extract the text of an uploaded PDF and return its summary.

    Args:
        pdf: The uploaded file.  Either a path string or an object whose
            ``name`` attribute holds the path of the file on disk.  ``None``
            (no file supplied) is answered with a short message instead of
            raising ``AttributeError``.
        max_length: Target summary length, forwarded to ``summarize_text``.
        show_length: Whether to append the word count, forwarded to
            ``summarize_text``.

    Returns:
        The string produced by ``summarize_text``, or a prompt to upload a
        file when *pdf* is ``None``.
    """
    if pdf is None:
        return "Please upload a PDF file."

    # Accept a plain path as well as a file-like wrapper exposing ``.name``.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    text = extract_text_from_pdf(pdf_path)
    return summarize_text(text, max_length, show_length)
# Form inputs, in the positional order expected by ``handle_pdf``:
# the uploaded file, the target length, and the "show length" flag.
# NOTE(review): whether ``type='file'`` is accepted depends on the installed
# Gradio version - confirm against the pinned release.
input_components = [
    gr.File(label='Upload PDF', type='file'),
    gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
    gr.Checkbox(label='Show summary length', value=False),
]

# Single text box that displays the string returned by ``handle_pdf``.
output_component = gr.Textbox(label='Summarized Text')

interface = gr.Interface(
    fn=handle_pdf,
    inputs=input_components,
    outputs=output_component,
    title='PDF Text Summarizer using T5-finetuned-dialogue_sumxx',
)

# Launch the app as soon as the module is executed.
interface.launch()