akjedidtz committed on
Commit
3651420
·
verified ·
1 Parent(s): a3b5666

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+
5
+
6
+
7
+ import speech_recognition as sr
8
+ from gtts import gTTS
9
+ from pydub import AudioSegment
10
+ from IPython.display import Audio
11
+
12
+ import torch
13
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
14
+ import soundfile as sf
15
+
16
# Choose the compute device: the first CUDA GPU when PyTorch reports one,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
18
+
19
+
20
+
21
+
22
+
23
import os
from groq import Groq

# Build the Groq client from the GROQ_API_KEY environment variable instead of
# embedding the key in source control.
# NOTE(review): an API key that has been committed to a repository should be
# treated as leaked -- revoke/rotate it and supply the new one via the
# environment (e.g. a deployment secret).
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
# Pair the compute device with a matching precision: half precision on a
# CUDA GPU, full precision on the CPU.
device, torch_dtype = (
    ("cuda:0", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)

# Checkpoint used for speech recognition.
model_id = "openai/whisper-medium"

# Load the speech seq2seq model in the chosen precision and move it to the
# selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor for this
# checkpoint.
processor = AutoProcessor.from_pretrained(model_id)
56
+
57
+ from transformers import pipeline
58
+ from gtts import gTTS
59
+ import gradio as gr
60
+ import torch
61
+
62
# Speech-recognition pipeline built around the already-loaded model; the
# tokenizer and feature extractor both come from the shared processor.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
)
71
+
72
# Initialize the Groq client from the GROQ_API_KEY environment variable; the
# key must not be embedded in the source.
# NOTE(review): a key that was previously committed should be treated as
# leaked and revoked/rotated.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
76
+
77
+ # Text-to-Speech function
78
def text_to_speech(text, lang="hi", output_path="response.mp3"):
    """Synthesize ``text`` to an MP3 file with gTTS.

    Args:
        text: Text to speak. Anything that is not a non-blank string
            (``None``, ``""``, whitespace only) is rejected up front without
            calling gTTS.
        lang: Language code handed to gTTS. Defaults to ``"hi"``.
        output_path: Path the MP3 is written to. Defaults to
            ``"response.mp3"``.

    Returns:
        ``output_path`` on success (suitable for playback in Gradio), or
        ``None`` when there is nothing to speak or synthesis fails.
    """
    # Nothing to synthesize: skip the gTTS call entirely.
    if not isinstance(text, str) or not text.strip():
        print("Text-to-speech skipped: no text to speak.")
        return None

    try:
        # Convert text to speech using gTTS and persist it as an MP3.
        tts = gTTS(text, lang=lang)
        tts.save(output_path)
    except Exception as e:
        # Best-effort: report the failure and let the caller continue
        # without audio.
        print(f"Text-to-speech error: {e}")
        return None
    return output_path
87
+
88
+ # Function to process audio, get model response, and return TTS output
89
def process_audio(audio):
    """Transcribe an audio clip, ask the Groq model about it, and voice the reply.

    Args:
        audio: Audio input forwarded to the ASR pipeline (the Gradio
            interface in this file is configured to pass a file path).
            ``None`` means no clip was supplied.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the model's
        reply and ``audio_path`` is whatever ``text_to_speech`` returned for
        it. When no audio was given or no text could be transcribed, ``text``
        is a short explanatory message and ``audio_path`` is ``None``.
    """
    # Guard: without a clip there is nothing to transcribe.
    if audio is None:
        print("No audio input was provided.")
        return "No audio input was provided.", None

    # Convert audio to text.
    print("Converting audio to text...")
    result = asr_pipe(audio, generate_kwargs={"language": "urdu"})

    # Bail out early if transcription produced no usable text.
    if "text" not in result or not result["text"].strip():
        print("Audio-to-text conversion failed or produced no text.")
        return "Audio-to-text conversion failed or no text was detected.", None

    user_ques = result["text"]
    print("Audio-to-text conversion successful. User Question:", user_ques)

    # Prepare messages for model input.
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant named SSk BOT that stands for (sehar bot) who mostly answers in Roman Urdu. Be professional. No emojis; just Urdu written in English letters, and if you receive a prompt in Urdu font, answer only in English (Roman Urdu).",
        },
        {
            "role": "user",
            "content": user_ques,
        },
    ]

    # Get response from Groq model.
    print("Getting response from the model...")
    response = client.chat.completions.create(
        messages=messages,
        model="gemma2-9b-it",
    )

    # The SDK returns a response object, not a dict, so the reply is read
    # through attributes rather than subscripting.
    model_response = response.choices[0].message.content
    print("Model:", model_response)

    # Convert model's response to speech.
    audio_path = text_to_speech(model_response)
    return model_response, audio_path
129
+
130
# Gradio UI: a single audio input (handed to the callback as a file path)
# and two outputs -- the model's text reply and the synthesized audio.
interface = gr.Interface(
    title="Real-time ASR to Language Model Response",
    description="Upload an audio file in Urdu, get a text response from the model, and hear the response in English.",
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Model Response"),
        gr.Audio(label="Response Audio"),
    ],
)

# Start serving the interface.
interface.launch()
141
+
142
+
143
+