"""Gradio app that extracts text (captions) from images.

Supports two backends for the same model (microsoft/git-base-textcaps):
a local transformers pipeline and the Hugging Face Inference API,
toggled at runtime with a button.
"""
import io
import os

import gradio as gr
from huggingface_hub import InferenceClient
from PIL import Image
from transformers import pipeline

# Get the token from environment variable; the Inference API backend
# cannot authenticate without it, so fail fast at startup.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError(
        "HF_TOKEN environment variable is not set. "
        "Please set it with your Hugging Face API token."
    )

# Initialize both local pipeline and Inference Client for the same model.
local_extractor = pipeline("image-to-text", model="microsoft/git-base-textcaps")
api_client = InferenceClient(model="microsoft/git-base-textcaps", token=hf_token)

# Flag to track which mode is active (toggled by switch_mode).
use_api = False


def switch_mode():
    """Toggle between API and local backends; return the new mode label.

    Returns:
        str: the label shown in the mode indicator textbox. extract_text
        keys off the substring "API" in this label, so the two must agree.
    """
    global use_api
    use_api = not use_api
    # FIX: the original label was missing its closing parenthesis.
    return "Using API" if use_api else "Using Local Model (Slow unless duplicated and run on GPU)"


def extract_text(image, mode_indicator):
    """Run the active backend on `image` and return the extracted text.

    Args:
        image: PIL image from the Gradio input, or None when the input
            is cleared (the change event still fires in that case).
        mode_indicator: current mode label; containing "API" selects the
            Inference API backend, anything else uses the local pipeline.

    Returns:
        str: the generated caption, or "" when no image is present.
    """
    # FIX: guard against the change event firing with a cleared input;
    # the original crashed on `None.format`.
    if image is None:
        return ""

    # Convert image to PNG if it's not already. Note: `format` is None
    # for images created in memory, so those are re-encoded too.
    if image.format != 'PNG':
        png_buffer = io.BytesIO()
        image.save(png_buffer, format='PNG')
        # FIX: rewind the buffer before re-opening; save() leaves the
        # stream position at the end and Image.open would fail.
        png_buffer.seek(0)
        png_image = Image.open(png_buffer)
    else:
        png_image = image

    if "API" in mode_indicator:
        # The Inference API expects raw bytes, not a PIL Image object.
        buffered = io.BytesIO()
        png_image.save(buffered, format="PNG")
        img_bytes = buffered.getvalue()
        result = api_client.image_to_text(image=img_bytes)
    else:
        result = local_extractor(png_image)

    # Local pipeline returns [{'generated_text': ...}]; the API client
    # returns the text directly.
    return result[0]['generated_text'] if isinstance(result, list) else result


# Create the Gradio interface.
with gr.Blocks() as iface:
    gr.Markdown("# Image Text Extractor")
    with gr.Row():
        image_input = gr.Image(type="pil")
        text_output = gr.Textbox()
    mode_button = gr.Button("Switch Mode")
    mode_indicator = gr.Textbox(value="Using Local Model", label="Current Mode")
    mode_button.click(switch_mode, outputs=mode_indicator)
    image_input.change(extract_text, inputs=[image_input, mode_indicator], outputs=text_output)

iface.launch()