import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)
import numpy as np
import gradio as gr
import librosa
from gradio.themes import Citrus

# Set the device (GPU or CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
# Initialize processors and models
try:
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-Instruct",
        # bfloat16 saves memory on GPU; plain float32 is the safer default on CPU
        torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    ).to(DEVICE)
    stt_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(DEVICE)
except Exception as e:
    print(f"Error loading model or processor: {str(e)}")
    raise SystemExit(1)
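
# SmolVLM-Instruct is a ~2B-parameter vision-language model, and
# wav2vec2-base-960h is an English ASR model trained on 16 kHz LibriSpeech
# audio, so spoken questions should be in English.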
# Define the function to convert speech to text
def speech_to_text(audio_path):
    try:
        # Load the recording as 16 kHz mono, the rate Wav2Vec2 expects
        speech, _ = librosa.load(audio_path, sr=16000)
        input_values = stt_processor(
            speech, return_tensors="pt", sampling_rate=16000
        ).input_values.to(DEVICE)
        with torch.no_grad():
            logits = stt_model(input_values).logits
        # Greedy CTC decoding: take the most likely token at every frame
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = stt_processor.decode(predicted_ids[0])
        print(f"Detected text: {transcription}")
        return transcription
    except Exception as e:
        return f"Error: Unable to process the audio. {str(e)}"
# Define the function to answer questions about the image
def answer_question(image, question, audio):
    # Prefer the spoken question if a recording is provided
    if audio is not None:
        question = speech_to_text(audio)
    # Check that an image is provided
    if image is None:
        return "Error: Please upload an image."
    # Convert the NumPy array Gradio passes in to a PIL Image
    try:
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
    except Exception as e:
        return f"Error: Unable to process the image. {str(e)}"
    # Ensure a question is provided (guard against None from an empty textbox)
    if not question or not question.strip():
        return "Error: Please provide a question."
    # Create the chat-style input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]
    # Apply the chat template and prepare model inputs
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    except Exception as e:
        return f"Error: Failed to prepare inputs. {str(e)}"
    # Generate the answer, keeping only the newly generated tokens so the
    # echoed prompt is not returned to the user
    try:
        outputs = model.generate(**inputs, max_new_tokens=400)
        generated_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        answer = processor.decode(generated_ids, skip_special_tokens=True)
        return answer
    except Exception as e:
        return f"Error: Failed to generate answer. {str(e)}"
# Customize the Citrus theme with a specific neutral_hue
custom_citrus = Citrus(neutral_hue="slate")

# Define the Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Image(type="numpy", value="faam_to_the_future.jpg"),
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
        gr.Audio(
            type="filepath",
            # sources expects a list; allow both uploading and recording,
            # matching the label below
            sources=["upload", "microphone"],
            label="Upload a recording or record a question",
        ),
    ],
    outputs="text",
    title="FAAM-demo | Vision Language Model | SmolVLM",
    description="Upload an image and ask questions about it",
    theme=custom_citrus,
)
# Launch the interface
iface.launch()
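
# On Hugging Face Spaces the default launch() is enough; for a temporary
# public link when running locally, iface.launch(share=True) can be used instead.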