Spaces:
Running
Running
nguyenbh
committed on
Commit
·
8591bfe
1
Parent(s):
ee88872
Init
Browse files- app.py +399 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import json
|
3 |
+
import requests
|
4 |
+
import os
|
5 |
+
import urllib.request
|
6 |
+
import ssl
|
7 |
+
import base64
|
8 |
+
import soundfile as sf
|
9 |
+
from io import BytesIO
|
10 |
+
import tempfile
|
11 |
+
from datetime import datetime
|
12 |
+
import logging
|
13 |
+
|
14 |
+
# Set up logging
# Module-level logger shared by the whole app; INFO level with timestamped records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
17 |
+
|
18 |
+
class AzureSpeechTranslatorApp:
    """Gradio speech-translation app backed by an Azure ML endpoint.

    Transcribes uploaded/recorded audio with the remote model, translates the
    text between supported languages, and keeps a per-language JSON history
    on disk under ``translations/``.
    """

    def __init__(self):
        # Azure ML endpoint configuration
        # NOTE(review): both values are None when the env vars are unset;
        # only the missing key is rejected later (in call_azure_endpoint) —
        # confirm AZURE_ENDPOINT is always configured in deployment.
        self.url = os.getenv("AZURE_ENDPOINT")
        self.api_key = os.getenv("AZURE_API_KEY")

        # Define supported languages with their codes and native names.
        # "code" is used for history file names and lookups; "native" is the
        # self-name of the language (currently unused by the UI).
        self.languages = {
            "English": {
                "code": "en",
                "native": "English"
            },
            "Chinese": {
                "code": "zh",
                "native": "中文"
            },
            "German": {
                "code": "de",
                "native": "Deutsch"
            },
            "French": {
                "code": "fr",
                "native": "Français"
            },
            "Italian": {
                "code": "it",
                "native": "Italiano"
            },
            "Japanese": {
                "code": "ja",
                "native": "日本語"
            },
            "Spanish": {
                "code": "es",
                "native": "Español"
            },
            "Portuguese": {
                "code": "pt",
                "native": "Português"
            }
        }

        # Initialize storage: one JSON history file per language code.
        self.translations_dir = "translations"
        os.makedirs(self.translations_dir, exist_ok=True)
        self.translations = self.load_translations()
64 |
+
|
65 |
+
def get_translation_file_path(self, lang_code):
|
66 |
+
"""Get path for language-specific translation file"""
|
67 |
+
return os.path.join(self.translations_dir, f"translations_{lang_code}.json")
|
68 |
+
|
69 |
+
def load_translations(self):
|
70 |
+
"""Load translations for all languages"""
|
71 |
+
translations = {}
|
72 |
+
for lang_info in self.languages.values():
|
73 |
+
file_path = self.get_translation_file_path(lang_info["code"])
|
74 |
+
if os.path.exists(file_path):
|
75 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
76 |
+
translations[lang_info["code"]] = json.load(f)
|
77 |
+
else:
|
78 |
+
translations[lang_info["code"]] = []
|
79 |
+
return translations
|
80 |
+
|
81 |
+
def save_translation(self, lang_code, translation):
|
82 |
+
"""Save translation for specific language"""
|
83 |
+
file_path = self.get_translation_file_path(lang_code)
|
84 |
+
with open(file_path, 'w', encoding='utf-8') as f:
|
85 |
+
json.dump(translation, f, ensure_ascii=False, indent=2)
|
86 |
+
|
87 |
+
def call_azure_endpoint(self, payload):
|
88 |
+
"""Call Azure ML endpoint with the given payload."""
|
89 |
+
# Allow self-signed HTTPS certificates
|
90 |
+
def allow_self_signed_https(allowed):
|
91 |
+
if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
|
92 |
+
ssl._create_default_https_context = ssl._create_unverified_context
|
93 |
+
|
94 |
+
allow_self_signed_https(True)
|
95 |
+
|
96 |
+
# Set parameters
|
97 |
+
parameters = {"temperature": 0.7}
|
98 |
+
if "parameters" not in payload["input_data"]:
|
99 |
+
payload["input_data"]["parameters"] = parameters
|
100 |
+
|
101 |
+
# Encode the request body
|
102 |
+
body = str.encode(json.dumps(payload))
|
103 |
+
|
104 |
+
if not self.api_key:
|
105 |
+
raise Exception("A key should be provided to invoke the endpoint")
|
106 |
+
|
107 |
+
# Set up headers
|
108 |
+
headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + self.api_key)}
|
109 |
+
|
110 |
+
# Create and send the request
|
111 |
+
req = urllib.request.Request(self.url, body, headers)
|
112 |
+
|
113 |
+
try:
|
114 |
+
logger.info(f"Sending request to {self.url}")
|
115 |
+
response = urllib.request.urlopen(req)
|
116 |
+
result = response.read().decode('utf-8')
|
117 |
+
logger.info("Received response successfully")
|
118 |
+
return json.loads(result)
|
119 |
+
except urllib.error.HTTPError as error:
|
120 |
+
logger.error(f"Request failed with status code: {error.code}")
|
121 |
+
logger.error(f"Headers: {error.info()}")
|
122 |
+
error_message = error.read().decode("utf8", 'ignore')
|
123 |
+
logger.error(f"Error message: {error_message}")
|
124 |
+
return {"error": error_message}
|
125 |
+
|
126 |
+
def encode_audio_base64(self, audio_path):
|
127 |
+
"""Encode audio file to base64 and determine MIME type"""
|
128 |
+
file_extension = os.path.splitext(audio_path)[1].lower()
|
129 |
+
|
130 |
+
# Map file extensions to MIME types
|
131 |
+
if file_extension == '.flac':
|
132 |
+
mime_type = "audio/flac"
|
133 |
+
elif file_extension == '.wav':
|
134 |
+
mime_type = "audio/wav"
|
135 |
+
elif file_extension == '.mp3':
|
136 |
+
mime_type = "audio/mpeg"
|
137 |
+
elif file_extension in ['.m4a', '.aac']:
|
138 |
+
mime_type = "audio/aac"
|
139 |
+
elif file_extension == '.ogg':
|
140 |
+
mime_type = "audio/ogg"
|
141 |
+
else:
|
142 |
+
mime_type = "audio/wav" # Default to WAV
|
143 |
+
|
144 |
+
# Read and encode file content
|
145 |
+
with open(audio_path, "rb") as file:
|
146 |
+
encoded_string = base64.b64encode(file.read()).decode('utf-8')
|
147 |
+
|
148 |
+
return encoded_string, mime_type
|
149 |
+
|
150 |
+
def transcribe_audio(self, audio_input, source_lang="English"):
|
151 |
+
"""Transcribe audio to text using Azure endpoint"""
|
152 |
+
try:
|
153 |
+
# Encode audio to base64
|
154 |
+
base64_audio, mime_type = self.encode_audio_base64(audio_input)
|
155 |
+
|
156 |
+
# Create input content for Azure ML
|
157 |
+
content_items = [
|
158 |
+
{
|
159 |
+
"type": "text",
|
160 |
+
"text": f"Transcribe this {source_lang} audio to text."
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"type": "audio_url",
|
164 |
+
"audio_url": {
|
165 |
+
"url": f"data:{mime_type};base64,{base64_audio}"
|
166 |
+
}
|
167 |
+
}
|
168 |
+
]
|
169 |
+
|
170 |
+
# Create conversation state for Azure ML
|
171 |
+
conversation_state = [
|
172 |
+
{
|
173 |
+
"role": "user",
|
174 |
+
"content": content_items
|
175 |
+
}
|
176 |
+
]
|
177 |
+
|
178 |
+
# Create the payload
|
179 |
+
payload = {
|
180 |
+
"input_data": {
|
181 |
+
"input_string": conversation_state
|
182 |
+
}
|
183 |
+
}
|
184 |
+
|
185 |
+
# Call Azure ML endpoint
|
186 |
+
response = self.call_azure_endpoint(payload)
|
187 |
+
|
188 |
+
# Extract text response
|
189 |
+
try:
|
190 |
+
if isinstance(response, dict):
|
191 |
+
if "result" in response:
|
192 |
+
result = response["result"]
|
193 |
+
elif "output" in response:
|
194 |
+
if isinstance(response["output"], list) and len(response["output"]) > 0:
|
195 |
+
result = response["output"][0]
|
196 |
+
else:
|
197 |
+
result = str(response["output"])
|
198 |
+
elif "error" in response:
|
199 |
+
result = f"Error: {response['error']}"
|
200 |
+
else:
|
201 |
+
result = f"Unexpected response format: {json.dumps(response)}"
|
202 |
+
else:
|
203 |
+
result = str(response)
|
204 |
+
except Exception as e:
|
205 |
+
result = f"Error processing response: {str(e)}"
|
206 |
+
|
207 |
+
return result.strip()
|
208 |
+
except Exception as e:
|
209 |
+
logger.error(f"Error in transcription: {str(e)}")
|
210 |
+
return f"Transcription failed: {str(e)}"
|
211 |
+
|
212 |
+
def translate_text(self, text, source_lang, target_lang):
|
213 |
+
"""Translate text between languages using Azure endpoint"""
|
214 |
+
if not text:
|
215 |
+
return "No text to translate"
|
216 |
+
|
217 |
+
try:
|
218 |
+
# Create input content for Azure ML
|
219 |
+
content_items = [
|
220 |
+
{
|
221 |
+
"type": "text",
|
222 |
+
"text": f"Translate the following {source_lang} text to {target_lang}. Provide only the translation without any additional text or explanation:\n\n{text}"
|
223 |
+
}
|
224 |
+
]
|
225 |
+
|
226 |
+
# Create conversation state for Azure ML
|
227 |
+
conversation_state = [
|
228 |
+
{
|
229 |
+
"role": "system",
|
230 |
+
"content": [{"type": "text", "text": "You are a professional translator."}]
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"role": "user",
|
234 |
+
"content": content_items
|
235 |
+
}
|
236 |
+
]
|
237 |
+
|
238 |
+
# Create the payload
|
239 |
+
payload = {
|
240 |
+
"input_data": {
|
241 |
+
"input_string": conversation_state
|
242 |
+
}
|
243 |
+
}
|
244 |
+
|
245 |
+
# Call Azure ML endpoint
|
246 |
+
response = self.call_azure_endpoint(payload)
|
247 |
+
|
248 |
+
# Extract text response
|
249 |
+
try:
|
250 |
+
if isinstance(response, dict):
|
251 |
+
if "result" in response:
|
252 |
+
result = response["result"]
|
253 |
+
elif "output" in response:
|
254 |
+
if isinstance(response["output"], list) and len(response["output"]) > 0:
|
255 |
+
result = response["output"][0]
|
256 |
+
else:
|
257 |
+
result = str(response["output"])
|
258 |
+
elif "error" in response:
|
259 |
+
result = f"Error: {response['error']}"
|
260 |
+
else:
|
261 |
+
result = f"Unexpected response format: {json.dumps(response)}"
|
262 |
+
else:
|
263 |
+
result = str(response)
|
264 |
+
except Exception as e:
|
265 |
+
result = f"Error processing response: {str(e)}"
|
266 |
+
|
267 |
+
return result.strip()
|
268 |
+
except Exception as e:
|
269 |
+
logger.error(f"Error in translation: {str(e)}")
|
270 |
+
return f"Translation failed: {str(e)}"
|
271 |
+
|
272 |
+
def process_translation(self, audio, source_lang, target_lang):
|
273 |
+
"""Process audio input and generate translation"""
|
274 |
+
if not audio:
|
275 |
+
return "Please provide an audio file to translate."
|
276 |
+
|
277 |
+
# Transcribe audio to text
|
278 |
+
source_text = self.transcribe_audio(audio, source_lang)
|
279 |
+
|
280 |
+
# Translate to target language
|
281 |
+
translation = self.translate_text(source_text, source_lang, target_lang)
|
282 |
+
|
283 |
+
# Create translation entry
|
284 |
+
translation_entry = {
|
285 |
+
"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
286 |
+
"source_language": source_lang,
|
287 |
+
"target_language": target_lang,
|
288 |
+
"source_text": source_text,
|
289 |
+
"translated_text": translation
|
290 |
+
}
|
291 |
+
|
292 |
+
# Save translation
|
293 |
+
source_code = self.languages[source_lang]["code"]
|
294 |
+
target_code = self.languages[target_lang]["code"]
|
295 |
+
|
296 |
+
if source_code not in self.translations:
|
297 |
+
self.translations[source_code] = []
|
298 |
+
if target_code not in self.translations:
|
299 |
+
self.translations[target_code] = []
|
300 |
+
|
301 |
+
self.translations[source_code].append(translation_entry)
|
302 |
+
self.translations[target_code].append(translation_entry)
|
303 |
+
|
304 |
+
self.save_translation(source_code, self.translations[source_code])
|
305 |
+
self.save_translation(target_code, self.translations[target_code])
|
306 |
+
|
307 |
+
return self.format_translation_display(translation_entry)
|
308 |
+
|
309 |
+
def format_translation_display(self, entry):
|
310 |
+
"""Format translation for display"""
|
311 |
+
output = f"""Timestamp: {entry['timestamp']}\n\n"""
|
312 |
+
output += f"""Source Language ({entry['source_language']}):\n{entry['source_text']}\n\n"""
|
313 |
+
output += f"""Target Language ({entry['target_language']}):\n{entry['translated_text']}\n"""
|
314 |
+
return output
|
315 |
+
|
316 |
+
def list_translations(self, lang_code):
|
317 |
+
"""List translations for specific language"""
|
318 |
+
if lang_code not in self.translations or not self.translations[lang_code]:
|
319 |
+
return "No translations found"
|
320 |
+
|
321 |
+
return "\n\n---\n\n".join([
|
322 |
+
self.format_translation_display(entry)
|
323 |
+
for entry in self.translations[lang_code]
|
324 |
+
])
|
325 |
+
|
326 |
+
    def create_interface(self):
        """Build and return the Gradio Blocks UI for the translator.

        Wires the Translate button to process_translation and the history
        language selector to list_translations. Does not launch the server.
        """
        with gr.Blocks(theme=gr.themes.Soft()) as interface:
            # NOTE(review): "Phine" in the title looks like a typo for "Phi" —
            # confirm intended branding before changing the displayed string.
            gr.Markdown("# Phine Speech Translator with Phi-4-Multimodal")
            gr.Markdown("Record speech or upload audio file for translation between multiple languages using [Phi-4-Multimodal](https://aka.ms/phi-4-multimodal/azure)")

            # Translation direction selectors (display names, not codes).
            with gr.Row():
                source_lang = gr.Dropdown(
                    choices=list(self.languages.keys()),
                    value="English",
                    label="Source Language"
                )
                target_lang = gr.Dropdown(
                    choices=list(self.languages.keys()),
                    value="Chinese",
                    label="Target Language"
                )

            # Audio source: microphone or upload; handlers receive a
            # filesystem path (type="filepath").
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio"
                )

            with gr.Row():
                translate_btn = gr.Button("Translate")

            with gr.Row():
                output = gr.Textbox(
                    label="Translation Results",
                    lines=10
                )

            # History viewer
            with gr.Accordion("Translation History", open=False):
                lang_select = gr.Dropdown(
                    choices=list(self.languages.keys()),
                    value="English",
                    label="Select Language"
                )
                history_output = gr.Textbox(
                    label="Translation History",
                    lines=20
                )

            # Event handlers
            translate_btn.click(
                fn=self.process_translation,
                inputs=[audio_input, source_lang, target_lang],
                outputs=output
            )

            # Map the selected display name back to its language code for the
            # history lookup.
            lang_select.change(
                fn=lambda x: self.list_translations(self.languages[x]["code"]),
                inputs=[lang_select],
                outputs=history_output
            )

        return interface
|
386 |
+
|
387 |
+
def run_app():
    """Instantiate the translator app and launch its Gradio UI (blocking)."""
    # Create app instance
    app = AzureSpeechTranslatorApp()

    # Launch Gradio interface
    interface = app.create_interface()
    # NOTE(review): share=True publishes a public Gradio link and
    # server_name="0.0.0.0" binds all interfaces — confirm this exposure is
    # intended outside the hosted Space environment.
    interface.launch(
        share=True,
        server_name="0.0.0.0"
    )

if __name__ == "__main__":
    run_app()
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
azure-ai-inference==1.0.0b9
azureml-inference-server-http==1.0.0
soundfile
gradio
requests
|