"""Gradio app that turns an uploaded PDF into a SQuAD-style JSON dataset.

Pipeline: extract the PDF text, generate questions with a seq2seq model,
answer them with an extractive question-answering pipeline, and write the
resulting question/answer pairs to a JSON file offered for download.
"""

import functools
import json
import os

import gradio as gr
import PyPDF2
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Hugging Face checkpoint used for question generation.
QG_MODEL_NAME = "valhalla/t5-base-qg-hl"
# Fallback output name when the user leaves the text box empty.
DEFAULT_OUTPUT_NAME = "squad_dataset"


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The value handed to ``PyPDF2.PdfReader`` (per the PyPDF2
            docs, a path or a binary file-like object).

    Returns:
        A single string with the text of all pages joined without a
        separator. Pages that yield no text contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` guards against pages (e.g. scanned images) for which the
    # extractor returns nothing, so the join never sees a non-string.
    return "".join(page.extract_text() or "" for page in reader.pages)


@functools.lru_cache(maxsize=1)
def _load_question_generator():
    """Load the question-generation tokenizer and model once per process."""
    tokenizer = AutoTokenizer.from_pretrained(QG_MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(QG_MODEL_NAME)
    return tokenizer, model


@functools.lru_cache(maxsize=1)
def _load_qa_pipeline():
    """Build the default question-answering pipeline once per process."""
    return pipeline("question-answering")


def generate_questions(text):
    """Generate questions about ``text`` with the seq2seq model.

    The prompt is truncated to 512 tokens, so only the beginning of a long
    document influences the output.

    Args:
        text: Source passage to ask questions about.

    Returns:
        A list with one decoded string per sequence returned by
        ``model.generate``.
    """
    # Loading the checkpoint is expensive; reuse it across requests.
    tokenizer, model = _load_question_generator()
    inputs = tokenizer.encode(
        "generate questions: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    outputs = model.generate(
        inputs,
        max_length=512,
        num_beams=4,
        early_stopping=True,
    )
    return [
        tokenizer.decode(output, skip_special_tokens=True)
        for output in outputs
    ]


def answer_questions(context, questions):
    """Answer each question against ``context`` with an extractive QA model.

    Args:
        context: Passage in which the answers are searched.
        questions: Iterable of question strings. Blank entries are skipped
            because the QA pipeline rejects empty questions.

    Returns:
        A list of dicts with keys ``question``, ``answer`` and
        ``answer_start`` (the ``start`` value reported by the pipeline).
    """
    usable = [q for q in questions if q and q.strip()]
    if not usable:
        # Nothing to answer: avoid loading the QA model at all.
        return []

    qa_pipeline = _load_qa_pipeline()
    qas = []
    for question in usable:
        answer = qa_pipeline(question=question, context=context)
        qas.append({
            "question": question,
            "answer": answer['answer'],
            "answer_start": answer['start'],
        })
    return qas


def convert_to_squad_format(qas, context):
    """Convert question/answer dicts into SQuAD-style records.

    Args:
        qas: Sequence of dicts with ``question``, ``answer`` and
            ``answer_start`` keys, as produced by ``answer_questions``.
        context: The passage shared by every record.

    Returns:
        A list of dicts, one per input pair, whose ``id`` is the pair's
        position in ``qas`` rendered as a string.
    """
    return [
        {
            "title": "Generated Data",
            "context": context,
            "question": qa['question'],
            "id": str(i),
            "answers": {
                "answer_start": [qa['answer_start']],
                "text": [qa['answer']],
            },
        }
        for i, qa in enumerate(qas)
    ]


def save_to_json(data, file_name):
    """Write ``data`` to ``file_name`` as pretty-printed UTF-8 JSON.

    Args:
        data: Any JSON-serialisable object.
        file_name: Destination path; ``.json`` is appended when missing.

    Returns:
        The path that was actually written.
    """
    if not file_name.endswith(".json"):
        file_name += ".json"
    with open(file_name, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    return file_name


def process_pdf(pdf_file, file_name):
    """Gradio callback: build the dataset for one uploaded PDF.

    Args:
        pdf_file: Value of the ``gr.File`` input (``None`` when nothing was
            uploaded).
        file_name: Desired output file name from the text box.

    Returns:
        Path of the JSON file that was written, for the download component.

    Raises:
        gr.Error: If no PDF was uploaded or no text could be extracted.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")

    context = extract_text_from_pdf(pdf_file)
    if not context.strip():
        # Typical for scanned PDFs; the QA pipeline would fail on an empty
        # context with a far less helpful message.
        raise gr.Error("No extractable text was found in the PDF.")

    questions = generate_questions(context)
    qas = answer_questions(context, questions)
    squad_data = convert_to_squad_format(qas, context)

    # The name comes from a web form: keep only its final path component so
    # the file cannot be written outside the working directory.
    safe_name = os.path.basename((file_name or "").strip()) or DEFAULT_OUTPUT_NAME
    return save_to_json(squad_data, safe_name)


# Gradio interface. ``demo`` stays at module level so external tooling that
# looks the app up by name can still find it.
with gr.Blocks() as demo:
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
        file_name = gr.Textbox(label="Output JSON File Name", value="squad_dataset")

    process_button = gr.Button("Process PDF")
    download_link = gr.File(label="Download JSON", interactive=False)

    process_button.click(
        fn=process_pdf,
        inputs=[pdf_file, file_name],
        outputs=download_link,
    )

if __name__ == "__main__":
    # Only start the server when run as a script, not on import.
    demo.launch()