Shahabmoin committed
Commit d52ffdf · verified · 1 Parent(s): 6e17af0

Update app.py

Files changed (1)
  1. app.py +52 -0
app.py CHANGED
@@ -0,0 +1,52 @@
+ import os
+ import whisper
+ from groq import Groq
+ from diffusers import StableDiffusionPipeline
+ import gradio as gr
+ import torch
+
+ # Load the Whisper speech-to-text model
+ whisper_model = whisper.load_model("base")
+
+ # Read the Groq API key from the environment rather than hardcoding it in source
+ GROQ_API_KEY = os.environ["GROQ_API_KEY"]
+ client = Groq(api_key=GROQ_API_KEY)
+
+ # Load the Stable Diffusion pipeline, on GPU when available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ stable_diffusion_model = StableDiffusionPipeline.from_pretrained(
+     "runwayml/stable-diffusion-v1-5"
+ ).to(device)
+
+ # Voice-to-image pipeline: audio -> transcript -> LLM response -> image
+ def voice_to_image(audio):
+     # Step 1: Transcribe audio to text using Whisper
+     transcription = whisper_model.transcribe(audio)
+     input_text = transcription["text"]
+
+     # Step 2: Send the transcript to the LLM via the Groq API
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {"role": "user", "content": input_text},
+         ],
+         model="llama3-8b-8192",
+         stream=False,
+     )
+     response_text = chat_completion.choices[0].message.content
+
+     # Step 3: Use the LLM response as the Stable Diffusion prompt
+     image = stable_diffusion_model(response_text).images[0]
+
+     return image
+
+ # Gradio interface: take an audio file path in, return the generated image
+ interface = gr.Interface(
+     fn=voice_to_image,
+     inputs=gr.Audio(type="filepath"),
+     outputs="image",
+     title="Real-Time Voice-to-Image Generator",
+     description="Transcribe voice input into an image using Whisper, Groq LLM, and Stable Diffusion.",
+ )
+
+ # Launch the Gradio app
+ interface.launch()
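
For a quick local check, the pipeline function can be called directly instead of going through the Gradio UI. A minimal sketch, assuming GROQ_API_KEY is set in the environment and a local recording exists; the filename sample.wav and the output path are hypothetical:

# Smoke-test the pipeline on a local recording, bypassing the UI
image = voice_to_image("sample.wav")
image.save("output.png")  # the pipeline step returns a PIL image

On Hugging Face Spaces, the API key can be stored as a repository secret, which is exposed to the app as an environment variable, so it never needs to appear in the source.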