# img2txt / app.py — Hugging Face Space (author: namelessai, commit 85a339a)
import os
import gradio as gr
from transformers import pipeline
from huggingface_hub import InferenceClient
from PIL import Image
import io
# Get the token from environment variable; fail fast at startup if missing
# so the Space surfaces a clear configuration error instead of a 401 later.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it with your Hugging Face API token.")
# Initialize both local pipeline and Inference Client for the same model,
# so the UI can toggle between on-device and hosted inference.
# NOTE(review): the local pipeline downloads/loads model weights at import
# time — startup blocks until microsoft/git-base-textcaps is cached.
local_extractor = pipeline("image-to-text", model="microsoft/git-base-textcaps")
# Hosted Inference API client; authenticated with the token checked above.
api_client = InferenceClient(model="microsoft/git-base-textcaps", token=hf_token)
# Flag to track which mode is active (False = local pipeline, True = hosted API).
use_api = False


def switch_mode():
    """Toggle between the hosted Inference API and the local pipeline.

    Returns:
        str: A human-readable label for the now-active mode. The label is
        later inspected by ``extract_text`` (it checks for the substring
        "API"), so "API" must appear only in the API-mode label.
    """
    global use_api
    use_api = not use_api
    # Fix: the local-model notice was missing its closing parenthesis.
    return "Using API" if use_api else "Using Local Model (Slow unless duplicated and run on GPU)"
def extract_text(image, mode_indicator):
    """Generate caption text for *image* with whichever backend is active.

    Args:
        image: ``PIL.Image.Image`` from the Gradio component, or ``None``
            when the input is cleared.
        mode_indicator (str): Current-mode label; the substring "API"
            selects the hosted Inference API, anything else the local
            pipeline.

    Returns:
        str: The generated text, or "" when no image is provided.
    """
    # Fix: the `change` event also fires when the image is cleared, in
    # which case `image` is None and `image.format` would raise.
    if image is None:
        return ""

    # Convert image to PNG if it's not already (format is None for
    # images created in memory, so those are converted too).
    if image.format != 'PNG':
        png_buffer = io.BytesIO()
        image.save(png_buffer, format='PNG')
        png_buffer.seek(0)  # rewind so the re-open reads from the start
        png_image = Image.open(png_buffer)
    else:
        png_image = image

    if "API" in mode_indicator:
        # The Inference API expects raw bytes, not a PIL object.
        buffered = io.BytesIO()
        png_image.save(buffered, format="PNG")
        result = api_client.image_to_text(image=buffered.getvalue())
    else:
        result = local_extractor(png_image)

    # The local pipeline returns [{'generated_text': ...}]; the API client
    # may return the text (or an output object) directly.
    return result[0]['generated_text'] if isinstance(result, list) else result
# Create the Gradio interface: image in, extracted text out, plus a
# button that flips between local-model and hosted-API inference.
with gr.Blocks() as iface:
    gr.Markdown("# Image Text Extractor")
    with gr.Row():
        image_input = gr.Image(type="pil")
        text_output = gr.Textbox()
    mode_button = gr.Button("Switch Mode")
    # Doubles as app state: extract_text inspects this label's text
    # (the substring "API") to decide which backend to use.
    mode_indicator = gr.Textbox(value="Using Local Model", label="Current Mode")
    mode_button.click(switch_mode, outputs=mode_indicator)
    # Re-run extraction whenever the uploaded image changes (or is cleared).
    image_input.change(extract_text, inputs=[image_input, mode_indicator], outputs=text_output)
iface.launch()