File size: 2,167 Bytes
1f2b1df
df0e0ab
1f2b1df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df0e0ab
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from transformers import pipeline

import fitz 
import gradio as gr
import requests
import io
import re
import os
from PIL import Image

# Hugging Face pipelines are built once at import time and shared by every
# request handled by the functions below.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
qa_model = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

# API token sent as the Bearer credential for the hosted inference request.
# A token already present in the environment (for example a deployment
# secret) takes precedence; the placeholder is only a fallback so that
# `my_key` is always defined. The previous code overwrote the environment
# value unconditionally, discarding any real token.
os.environ.setdefault("HUGGINGFACE_HUB_TOKEN", "ctp-hw")
my_key = os.environ["HUGGINGFACE_HUB_TOKEN"]

def extract_text_from_pdf(pdf_file):
    """Return the text of a PDF as one whitespace-normalized string.

    Args:
        pdf_file: The document to open; passed unchanged to ``fitz.open``.

    Returns:
        The text of every page concatenated in page order, with each run of
        whitespace collapsed to a single space and leading/trailing
        whitespace removed.
    """
    with fitz.open(pdf_file) as document:
        page_texts = [page.get_text("text") for page in document]

    raw_text = "".join(page_texts)
    # Collapse line breaks and repeated spaces so the text is one flat string.
    return re.sub(r'\s+', ' ', raw_text).strip()

def summarize_pdf(pdf_file, chunk_size=1000):
    """Summarize the text of a PDF, chunk by chunk.

    The extracted text is cut into consecutive pieces of at most
    ``chunk_size`` characters, each piece is summarized independently, and
    the partial summaries are joined with single spaces.

    Args:
        pdf_file: The document to summarize; forwarded to
            ``extract_text_from_pdf``.
        chunk_size: Maximum number of characters per summarizer call. Must
            be a positive integer. Defaults to 1000.

    Returns:
        The combined summary, or an empty string when the PDF yields no
        extractable text.
    """
    text = extract_text_from_pdf(pdf_file)

    # Nothing to summarize (e.g. a PDF with no text layer): do not invoke
    # the model on an empty string.
    if not text:
        return ""

    # Text no longer than chunk_size yields exactly one chunk, so short and
    # long documents share the same code path.
    chunks = [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
    partial_summaries = [
        summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]
    return " ".join(partial_summaries)

def answer_question(pdf_file, question):
    """Answer a question using the text of a PDF as context.

    Args:
        pdf_file: The document to read; forwarded to
            ``extract_text_from_pdf``.
        question: The question to answer.

    Returns:
        The answer string produced by the question-answering pipeline, or an
        empty string when either the question or the extracted text is empty.
    """
    text = extract_text_from_pdf(pdf_file)

    # The question-answering pipeline rejects an empty question or context,
    # so return "no answer" instead of letting the request fail.
    if not text or not (question or "").strip():
        return ""

    result = qa_model(question=question, context=text)
    return result['answer']

# Endpoint and credentials for the hosted image-generation request made by
# `query` below.
API_URL = "https://api-inference.huggingface.co/models/stable-diffusion-v1-5/stable-diffusion-v1-5"
headers = {
    "Authorization": "Bearer " + my_key,
}

def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the raw response body.

    Args:
        payload: JSON-serializable request body.

    Returns:
        The response content as bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the request forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    # Fail here on an error status rather than handing an error body to the
    # caller as if it were image data.
    response.raise_for_status()
    return response.content

def summarize_and_qa(pdf_file, question):
    """Summarize a PDF, answer a question about it, and illustrate the answer.

    Args:
        pdf_file: The uploaded PDF.
        question: The question to answer from the PDF's content.

    Returns:
        A ``(summary, answer, image)`` tuple. ``image`` is a PIL image
        generated from the answer text, or ``None`` when there is no answer
        to illustrate or the image could not be obtained.
    """
    import logging

    summary = summarize_pdf(pdf_file)
    answer = answer_question(pdf_file, question)

    # The illustration is optional: a failed or invalid image response must
    # not discard the summary and answer that were already computed.
    image = None
    if answer:
        try:
            image_bytes = query({
                "inputs": answer,
            })
            image = Image.open(io.BytesIO(image_bytes))
        except (requests.RequestException, OSError):
            # OSError covers PIL's error for bytes that are not a valid image.
            logging.getLogger(__name__).exception("Image generation failed")

    return summary, answer, image

# Build the UI once and bind it to `demo` so the entry-point guard below can
# launch it. Previously the interface was launched inline and `demo` was
# never defined, so `demo.launch()` raised NameError.
demo = gr.Interface(
    fn=summarize_and_qa,
    inputs=["file", "text"],
    outputs=["textbox", "textbox", "image"],
    title="PDF Summary and Q&A",
    description="Upload a PDF to get a summary and answer questions based on the content. It will also give a picture to help you better understand the content."
)

# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()