"""Voice-to-image Gradio demo.

Pipeline: spoken audio is transcribed with Whisper, the transcript is sent
to a Groq-hosted LLM, and the LLM's reply is used as the prompt for a
Stable Diffusion image.

The Groq API key is read from the ``GROQ_API_KEY`` environment variable;
credentials must never be committed to source control.
"""

import os
from typing import Optional

import whisper
from groq import Groq
from diffusers import StableDiffusionPipeline
import gradio as gr
import torch

# Read the credential from the environment and fail fast (before the slow
# model downloads below) with an actionable message if it is missing.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "The GROQ_API_KEY environment variable is not set; "
        "export it before starting the app."
    )
client = Groq(api_key=GROQ_API_KEY)

# Load Whisper model
whisper_model = whisper.load_model("base")

# Load Stable Diffusion pipeline on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
stable_diffusion_model = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"
).to(device)


def voice_to_image(audio: Optional[str]):
    """Turn a recorded voice clip into a generated image.

    Args:
        audio: Path to the audio file handed over by the Gradio ``Audio``
            component (configured with ``type="filepath"``). ``None`` or an
            empty value means the user submitted without recording/uploading.

    Returns:
        The first image in the Stable Diffusion pipeline's output.

    Raises:
        gr.Error: If no audio was supplied or no text could be transcribed,
            so the UI shows a readable message instead of a stack trace.
    """
    # Gradio passes None when the form is submitted without any audio.
    if not audio:
        raise gr.Error("No audio received. Please record or upload a clip.")

    # Step 1: Transcribe audio to text using Whisper
    transcription = whisper_model.transcribe(audio)
    input_text = transcription["text"].strip()
    if not input_text:
        # Silence/noise yields an empty transcript; don't query the LLM
        # with an empty message.
        raise gr.Error("No speech could be transcribed from the audio.")

    # Step 2: Query LLM using Groq API
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": input_text},
        ],
        model="llama3-8b-8192",
        stream=False,
    )
    response_text = chat_completion.choices[0].message.content

    # Step 3: Generate image using Stable Diffusion
    # NOTE(review): the LLM reply is used verbatim as the prompt; long
    # replies are presumably truncated by the pipeline's text encoder --
    # confirm whether a shorter, prompt-style reply should be requested.
    image = stable_diffusion_model(response_text).images[0]
    return image


# Gradio Interface
interface = gr.Interface(
    fn=voice_to_image,
    inputs=gr.Audio(type="filepath"),
    outputs="image",
    title="Real-Time Voice-to-Image Generator",
    description="Transcribe voice input into an image using Whisper, Groq LLM, and Stable Diffusion.",
)

# Launch Gradio app
interface.launch()