# Talk2Deck / app.py
import os

import gradio as gr
import torch
from PIL import Image
from pdf2image import convert_from_bytes
from transformers import AutoModel, AutoTokenizer, ViTImageProcessor, ViTModel
# CSS styles
css = """
.button {
    padding: 10px 20px;
    background: #007BFF;
    color: white;
    border: none;
    cursor: pointer;
    font-size: 16px;
    margin: 10px;
}
"""
# Layout sketch with custom-styled rows. NOTE: this list is never passed to
# Gradio; the UI is defined by the gr.Interface at the bottom of the file, so
# the sketch is kept here only as a reference for a future Blocks layout.
# layout = [
#     gr.Row([gr.File(label="Upload PDF", type="binary")]),
#     gr.Row([gr.Button("Generate Insights")]),
#     gr.Row([gr.Textbox("Placeholder for PDF insights", label="Insights")]),
# ]
# Function to get image embeddings using ViT.
# NOTE: the processor and model are reloaded on every call; caching them at
# module level would be noticeably faster for multi-page PDFs.
def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
    image_processor = ViTImageProcessor.from_pretrained(model_name)
    model = ViTModel.from_pretrained(model_name)
    image = Image.open(image_path).convert("RGB")
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over patch tokens
    return embeddings
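# Example (illustrative only): embedding a single rendered page.
#     emb = get_image_embeddings("pdf_images/page_1.png")
#     emb.shape  # -> torch.Size([1, 768]) for the ViT-base checkpoint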
# Function to convert a PDF (raw bytes) into one PNG image per page.
# NOTE: pdf2image requires the poppler utilities to be installed on the system.
def pdf_to_images(pdf_bytes, img_dir):
    images = convert_from_bytes(pdf_bytes)
    # Create the output directory if it doesn't exist
    os.makedirs(img_dir, exist_ok=True)
    for i, image in enumerate(images):
        image_path = f"{img_dir}/page_{i + 1}.png"
        image.save(image_path, "PNG")
    print(f"Converted {len(images)} pages to images and saved in {img_dir}")
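# Example (illustrative only; "deck.pdf" is just a sample file name):
#     with open("deck.pdf", "rb") as f:
#         pdf_to_images(f.read(), "pdf_images")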
# Function to get text embeddings using a transformer model
def get_text_embeddings(text, model_name='bert-base-uncased'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over tokens
    return embeddings
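# Illustrative helper (not wired into the app): one simple way to relate the
# page and text embeddings above is cosine similarity. Both ViT-base and
# BERT-base produce 768-dimensional vectors, so the shapes line up, but the two
# models embed into different spaces, so treat the score as a rough signal only.
def embedding_similarity(image_embedding, text_embedding):
    # Both inputs are (1, 768) tensors; returns a Python float in [-1, 1].
    return torch.nn.functional.cosine_similarity(image_embedding, text_embedding).item()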
# Function to process an uploaded PDF and generate a response
def process_pdf_and_generate_response(pdf_file):
    try:
        # gr.File(type="binary") passes the upload in as raw bytes
        # Convert the PDF pages to images
        img_dir = "pdf_images"
        pdf_to_images(pdf_file, img_dir)
        # Generate embeddings for each page image
        image_embeddings = []
        for filename in sorted(os.listdir(img_dir)):
            if filename.endswith(".png"):
                image_path = os.path.join(img_dir, filename)
                image_embeddings.append(get_image_embeddings(image_path))
        # Perform some text analysis on the PDF content (replace with your logic)
        pdf_text = "PDF content analysis placeholder"
        text_embeddings = get_text_embeddings(pdf_text)
        # Combine image and text embeddings and generate a response (replace with your logic)
        combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
        response = "Response based on the processed PDF"
    except Exception as e:
        response = f"An error occurred: {str(e)}"
    return response
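# A minimal sketch of how the pdf_text placeholder above could be replaced with
# real text extraction, assuming the pypdf package is installed (it is not among
# this app's imports, so the snippet is left commented out):
#
#     from io import BytesIO
#     from pypdf import PdfReader
#
#     reader = PdfReader(BytesIO(pdf_file))
#     pdf_text = "\n".join(page.extract_text() or "" for page in reader.pages)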
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=gr.File(label="Upload PDF", type="binary"),  # delivers the upload as raw bytes
    outputs=gr.Textbox(label="Insights", placeholder="Placeholder for PDF insights"),
    title="pdf-chatbot",
    description="Upload a PDF and receive insights based on its content.",
    css=css,
)
if __name__ == "__main__":
iface.launch()