"""Gradio app that summarizes the text of an uploaded PDF with a T5 model."""

import fitz  # PyMuPDF
import gradio as gr
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# The encoder input is truncated to this many tokens.
MAX_INPUT_TOKENS = 512
# Extra generation budget on top of the requested word count; the decoded
# summary is cut back to the word limit afterwards.
EXTRA_OUTPUT_TOKENS = 20
# Lower bound (in tokens) passed to ``model.generate``.
MIN_OUTPUT_TOKENS = 10


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Path of the PDF file, as accepted by ``fitz.open``.

    Returns:
        The page texts joined in page order, with nothing inserted between
        pages. Empty when no page yields any text.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def summarize_text(text, max_length, show_length):
    """Summarize *text* with the T5 model, capped at *max_length* words.

    Args:
        text: The text to summarize. It is prefixed with ``"summarize: "``
            and truncated to ``MAX_INPUT_TOKENS`` tokens before encoding.
        max_length: Maximum number of whitespace-separated words in the
            returned summary. Coerced with ``int()`` because it is used in
            arithmetic and as a slice bound.
        show_length: When true, a ``(Summary length: N words)`` footer is
            appended to the summary.

    Returns:
        The summary string. It is an upper-bounded result: a summary shorter
        than *max_length* words is returned as generated and is never padded
        with extra text.
    """
    max_words = int(max_length)

    # A single sequence needs no padding; the attention mask is passed
    # explicitly alongside the input ids.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors='pt',
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
    )
    summary_ids = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_words + EXTRA_OUTPUT_TOKENS,
        min_length=MIN_OUTPUT_TOKENS,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # The generation budget is counted in tokens, so enforce the word limit
    # on the decoded text.
    words = summary.split()
    if len(words) > max_words:
        summary = ' '.join(words[:max_words])

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"

    return summary


def handle_pdf(pdf, max_length, show_length):
    """Gradio callback: extract the uploaded PDF's text and summarize it.

    Args:
        pdf: The uploaded file. Either a path string or an object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.
        max_length: Word limit forwarded to ``summarize_text``.
        show_length: Footer flag forwarded to ``summarize_text``.

    Returns:
        The summary, or a short message when there is no file or the PDF
        contains no extractable text.
    """
    if pdf is None:
        return 'Please upload a PDF file.'

    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return 'No extractable text was found in this PDF.'

    return summarize_text(text, max_length, show_length)


interface = gr.Interface(
    fn=handle_pdf,
    inputs=[
        # NOTE(review): newer Gradio releases appear to expect
        # type='filepath' here instead of 'file' -- confirm against the
        # installed version. handle_pdf accepts either form.
        gr.File(label='Upload PDF', type='file'),
        gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
        gr.Checkbox(label='Show summary length', value=False),
    ],
    outputs=gr.Textbox(label='Summarized Text'),
    title='PDF Text Summarizer using T5-finetuned-dialogue_sumxx',
)

interface.launch()