"""Gradio demo: ask questions about an image with SmolVLM, by text or voice.

The script loads a vision-language model (SmolVLM-Instruct) and a speech
recognition model (Wav2Vec2), then serves a Gradio interface in which the
user supplies an image plus a typed or recorded question.
"""
import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)
import numpy as np
import gradio as gr
import librosa
from gradio.themes import Citrus

# Set the device (GPU or CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Sampling rate the audio is resampled to and that is declared to the
# speech-to-text processor; both must agree.
STT_SAMPLE_RATE = 16000

# Initialize processor and model
try:
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-Instruct",
        torch_dtype=torch.bfloat16,
        # FlashAttention-2 is only requested on CUDA; fall back to eager on CPU.
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    ).to(DEVICE)
    stt_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(DEVICE)
except Exception as e:
    print(f"Error loading model or processor: {str(e)}")
    # `exit()` is injected by the `site` module for interactive use and is not
    # guaranteed to exist; raising SystemExit works in every run mode.
    raise SystemExit(1) from e


def _transcribe(audio):
    """Transcribe the audio file at path *audio*; raise on any failure.

    Args:
        audio: Path of the audio file to load (passed to ``librosa.load``).

    Returns:
        The decoded transcription string.
    """
    waveform, _ = librosa.load(audio, sr=STT_SAMPLE_RATE)
    input_values = stt_processor(
        waveform, return_tensors="pt", sampling_rate=STT_SAMPLE_RATE
    ).input_values.to(DEVICE)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = stt_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = stt_processor.decode(predicted_ids[0])
    print(f"Detected text: {transcription}")
    return transcription


# Define the function to convert speech to text
def speech_to_text(audio):
    """Convert a recorded audio file to text.

    Args:
        audio: Path of the audio file to transcribe.

    Returns:
        The transcription, or a string starting with ``"Error:"`` if the audio
        could not be processed (this function never raises).
    """
    try:
        return _transcribe(audio)
    except Exception as e:
        return f"Error: Unable to process the audio. {str(e)}"


# Define the function to answer questions
def answer_question(image, question, audio):
    """Answer a question about an image using the vision-language model.

    Args:
        image: The image as a NumPy array (converted to PIL) or an object the
            processor accepts directly; ``None`` if nothing was uploaded.
        question: The typed question; ignored when *audio* is provided.
        audio: Path to a recorded question, or ``None``. When given, its
            transcription replaces *question*.

    Returns:
        The decoded model output, or a string starting with ``"Error:"``
        describing what went wrong.
    """
    # Check the image first: it is the cheapest validation and avoids running
    # speech recognition for a request that cannot be answered anyway.
    if image is None:
        return "Error: Please upload an image."

    # Convert speech to text if audio is provided. A transcription failure is
    # reported to the user rather than being passed on to the model as if the
    # error message were the question.
    if audio is not None:
        try:
            question = _transcribe(audio)
        except Exception as e:
            return f"Error: Unable to process the audio. {str(e)}"

    # Convert NumPy array to PIL Image
    try:
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
    except Exception as e:
        return f"Error: Unable to process the image. {str(e)}"

    # Ensure question is provided (it may be None, empty, or whitespace only)
    if question is None or not question.strip():
        return "Error: Please provide a question."

    # Create input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    # Apply chat template and prepare inputs
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    except Exception as e:
        return f"Error: Failed to prepare inputs. {str(e)}"

    # Generate answer
    try:
        outputs = model.generate(**inputs, max_new_tokens=400)
        answer = processor.decode(outputs[0], skip_special_tokens=True)
        return answer
    except Exception as e:
        return f"Error: Failed to generate answer. {str(e)}"


# Customize the Citrus theme with a specific neutral_hue
custom_citrus = Citrus(neutral_hue="slate")

# Define your Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Image(type="numpy", value="faam_to_the_future.jpg"),
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
        gr.Audio(
            type="filepath",
            sources="microphone",
            label="Upload a recording or record a question",
        ),
    ],
    outputs="text",
    title="FAAM-demo | Vision Language Model | SmolVLM",
    description="Upload an image and ask questions about it",
    theme=custom_citrus,
)

# Launch the interface
iface.launch()