nguyenbh committed
Commit 8591bfe
1 Parent(s): ee88872
Files changed (2):
  1. app.py +399 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,399 @@
+ import gradio as gr
+ import json
+ import requests
+ import os
+ import urllib.request
+ import ssl
+ import base64
+ import soundfile as sf
+ from io import BytesIO
+ import tempfile
+ from datetime import datetime
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ class AzureSpeechTranslatorApp:
+     def __init__(self):
+         # Azure ML endpoint configuration
+         self.url = os.getenv("AZURE_ENDPOINT")
+         self.api_key = os.getenv("AZURE_API_KEY")
+
+         # Define supported languages with their codes and native names
+         self.languages = {
+             "English": {"code": "en", "native": "English"},
+             "Chinese": {"code": "zh", "native": "中文"},
+             "German": {"code": "de", "native": "Deutsch"},
+             "French": {"code": "fr", "native": "Français"},
+             "Italian": {"code": "it", "native": "Italiano"},
+             "Japanese": {"code": "ja", "native": "日本語"},
+             "Spanish": {"code": "es", "native": "Español"},
+             "Portuguese": {"code": "pt", "native": "Português"}
+         }
+
+         # Initialize storage
+         self.translations_dir = "translations"
+         os.makedirs(self.translations_dir, exist_ok=True)
+         self.translations = self.load_translations()
+
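+     # NOTE: __init__ reads AZURE_ENDPOINT (assumed to be the scoring URL of an
+     # Azure ML online deployment) and AZURE_API_KEY (its access key) from the
+     # environment; neither value is bundled with this Space.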
+     def get_translation_file_path(self, lang_code):
+         """Get path for language-specific translation file"""
+         return os.path.join(self.translations_dir, f"translations_{lang_code}.json")
+
+     def load_translations(self):
+         """Load translations for all languages"""
+         translations = {}
+         for lang_info in self.languages.values():
+             file_path = self.get_translation_file_path(lang_info["code"])
+             if os.path.exists(file_path):
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     translations[lang_info["code"]] = json.load(f)
+             else:
+                 translations[lang_info["code"]] = []
+         return translations
+
+     def save_translation(self, lang_code, translation):
+         """Save the translation list for a specific language"""
+         file_path = self.get_translation_file_path(lang_code)
+         with open(file_path, 'w', encoding='utf-8') as f:
+             json.dump(translation, f, ensure_ascii=False, indent=2)
+
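+     # NOTE: each translations_<code>.json written above holds a list of entries
+     # shaped like the ones built in process_translation below, e.g. (illustrative
+     # values): {"timestamp": "2025-01-01 12:00:00", "source_language": "English",
+     # "target_language": "Chinese", "source_text": "...", "translated_text": "..."}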
+     def call_azure_endpoint(self, payload):
+         """Call Azure ML endpoint with the given payload."""
+         # Allow self-signed HTTPS certificates
+         def allow_self_signed_https(allowed):
+             if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
+                 ssl._create_default_https_context = ssl._create_unverified_context
+
+         allow_self_signed_https(True)
+
+         # Set default sampling parameters if none were supplied
+         parameters = {"temperature": 0.7}
+         if "parameters" not in payload["input_data"]:
+             payload["input_data"]["parameters"] = parameters
+
+         # Encode the request body
+         body = str.encode(json.dumps(payload))
+
+         if not self.api_key:
+             raise Exception("An API key must be provided to invoke the endpoint")
+
+         # Set up headers
+         headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + self.api_key)}
+
+         # Create and send the request
+         req = urllib.request.Request(self.url, body, headers)
+
+         try:
+             logger.info(f"Sending request to {self.url}")
+             response = urllib.request.urlopen(req)
+             result = response.read().decode('utf-8')
+             logger.info("Received response successfully")
+             return json.loads(result)
+         except urllib.error.HTTPError as error:
+             logger.error(f"Request failed with status code: {error.code}")
+             logger.error(f"Headers: {error.info()}")
+             error_message = error.read().decode("utf8", 'ignore')
+             logger.error(f"Error message: {error_message}")
+             return {"error": error_message}
+
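+     # NOTE: call_azure_endpoint assumes the chat-style Azure ML scoring contract
+     # used throughout this app:
+     #   {"input_data": {"input_string": [{"role": ..., "content": [...]}],
+     #                   "parameters": {"temperature": ...}}}
+     # Adjust the payload shape if your deployment exposes a different schema.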
+     def encode_audio_base64(self, audio_path):
+         """Encode audio file to base64 and determine MIME type"""
+         file_extension = os.path.splitext(audio_path)[1].lower()
+
+         # Map file extensions to MIME types
+         if file_extension == '.flac':
+             mime_type = "audio/flac"
+         elif file_extension == '.wav':
+             mime_type = "audio/wav"
+         elif file_extension == '.mp3':
+             mime_type = "audio/mpeg"
+         elif file_extension in ['.m4a', '.aac']:
+             mime_type = "audio/aac"
+         elif file_extension == '.ogg':
+             mime_type = "audio/ogg"
+         else:
+             mime_type = "audio/wav"  # Default to WAV
+
+         # Read and encode file content
+         with open(audio_path, "rb") as file:
+             encoded_string = base64.b64encode(file.read()).decode('utf-8')
+
+         return encoded_string, mime_type
+
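+     # NOTE: callers embed the base64 string returned above in an inline data URL,
+     # e.g. "data:audio/wav;base64,UklGR..." (illustrative, truncated).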
+     def transcribe_audio(self, audio_input, source_lang="English"):
+         """Transcribe audio to text using Azure endpoint"""
+         try:
+             # Encode audio to base64
+             base64_audio, mime_type = self.encode_audio_base64(audio_input)
+
+             # Create input content for Azure ML
+             content_items = [
+                 {
+                     "type": "text",
+                     "text": f"Transcribe this {source_lang} audio to text."
+                 },
+                 {
+                     "type": "audio_url",
+                     "audio_url": {
+                         "url": f"data:{mime_type};base64,{base64_audio}"
+                     }
+                 }
+             ]
+
+             # Create conversation state for Azure ML
+             conversation_state = [
+                 {
+                     "role": "user",
+                     "content": content_items
+                 }
+             ]
+
+             # Create the payload
+             payload = {
+                 "input_data": {
+                     "input_string": conversation_state
+                 }
+             }
+
+             # Call Azure ML endpoint
+             response = self.call_azure_endpoint(payload)
+
+             # Extract text response
+             try:
+                 if isinstance(response, dict):
+                     if "result" in response:
+                         result = response["result"]
+                     elif "output" in response:
+                         if isinstance(response["output"], list) and len(response["output"]) > 0:
+                             result = response["output"][0]
+                         else:
+                             result = str(response["output"])
+                     elif "error" in response:
+                         result = f"Error: {response['error']}"
+                     else:
+                         result = f"Unexpected response format: {json.dumps(response)}"
+                 else:
+                     result = str(response)
+             except Exception as e:
+                 result = f"Error processing response: {str(e)}"
+
+             return result.strip()
+         except Exception as e:
+             logger.error(f"Error in transcription: {str(e)}")
+             return f"Transcription failed: {str(e)}"
+
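+     # NOTE: transcribe_audio tolerates several response shapes, accepting
+     # {"result": ...}, {"output": [...]}, or {"error": ...} and falling back to
+     # stringifying anything else, since the endpoint's format is not guaranteed.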
+     def translate_text(self, text, source_lang, target_lang):
+         """Translate text between languages using Azure endpoint"""
+         if not text:
+             return "No text to translate"
+
+         try:
+             # Create input content for Azure ML
+             content_items = [
+                 {
+                     "type": "text",
+                     "text": f"Translate the following {source_lang} text to {target_lang}. Provide only the translation without any additional text or explanation:\n\n{text}"
+                 }
+             ]
+
+             # Create conversation state for Azure ML
+             conversation_state = [
+                 {
+                     "role": "system",
+                     "content": [{"type": "text", "text": "You are a professional translator."}]
+                 },
+                 {
+                     "role": "user",
+                     "content": content_items
+                 }
+             ]
+
+             # Create the payload
+             payload = {
+                 "input_data": {
+                     "input_string": conversation_state
+                 }
+             }
+
+             # Call Azure ML endpoint
+             response = self.call_azure_endpoint(payload)
+
+             # Extract text response
+             try:
+                 if isinstance(response, dict):
+                     if "result" in response:
+                         result = response["result"]
+                     elif "output" in response:
+                         if isinstance(response["output"], list) and len(response["output"]) > 0:
+                             result = response["output"][0]
+                         else:
+                             result = str(response["output"])
+                     elif "error" in response:
+                         result = f"Error: {response['error']}"
+                     else:
+                         result = f"Unexpected response format: {json.dumps(response)}"
+                 else:
+                     result = str(response)
+             except Exception as e:
+                 result = f"Error processing response: {str(e)}"
+
+             return result.strip()
+         except Exception as e:
+             logger.error(f"Error in translation: {str(e)}")
+             return f"Translation failed: {str(e)}"
+
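+     # NOTE: translate_text pins the model to translator-only behavior via the
+     # system message, and the user prompt asks for the bare translation so the
+     # raw reply can be displayed as-is.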
+     def process_translation(self, audio, source_lang, target_lang):
+         """Process audio input and generate translation"""
+         if not audio:
+             return "Please provide an audio file to translate."
+
+         # Transcribe audio to text
+         source_text = self.transcribe_audio(audio, source_lang)
+
+         # Translate to target language
+         translation = self.translate_text(source_text, source_lang, target_lang)
+
+         # Create translation entry
+         translation_entry = {
+             "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+             "source_language": source_lang,
+             "target_language": target_lang,
+             "source_text": source_text,
+             "translated_text": translation
+         }
+
+         # Save the translation under both the source and target languages
+         source_code = self.languages[source_lang]["code"]
+         target_code = self.languages[target_lang]["code"]
+
+         if source_code not in self.translations:
+             self.translations[source_code] = []
+         if target_code not in self.translations:
+             self.translations[target_code] = []
+
+         self.translations[source_code].append(translation_entry)
+         self.translations[target_code].append(translation_entry)
+
+         self.save_translation(source_code, self.translations[source_code])
+         self.save_translation(target_code, self.translations[target_code])
+
+         return self.format_translation_display(translation_entry)
+
+     def format_translation_display(self, entry):
+         """Format translation for display"""
+         output = f"Timestamp: {entry['timestamp']}\n\n"
+         output += f"Source Language ({entry['source_language']}):\n{entry['source_text']}\n\n"
+         output += f"Target Language ({entry['target_language']}):\n{entry['translated_text']}\n"
+         return output
+
+     def list_translations(self, lang_code):
+         """List translations for specific language"""
+         if lang_code not in self.translations or not self.translations[lang_code]:
+             return "No translations found"
+
+         return "\n\n---\n\n".join([
+             self.format_translation_display(entry)
+             for entry in self.translations[lang_code]
+         ])
+
+     def create_interface(self):
+         """Create Gradio interface"""
+         with gr.Blocks(theme=gr.themes.Soft()) as interface:
+             gr.Markdown("# Phine Speech Translator with Phi-4-Multimodal")
+             gr.Markdown("Record speech or upload an audio file to translate between multiple languages using [Phi-4-Multimodal](https://aka.ms/phi-4-multimodal/azure)")
+
+             with gr.Row():
+                 source_lang = gr.Dropdown(
+                     choices=list(self.languages.keys()),
+                     value="English",
+                     label="Source Language"
+                 )
+                 target_lang = gr.Dropdown(
+                     choices=list(self.languages.keys()),
+                     value="Chinese",
+                     label="Target Language"
+                 )
+
+             with gr.Row():
+                 audio_input = gr.Audio(
+                     sources=["microphone", "upload"],
+                     type="filepath",
+                     label="Record or Upload Audio"
+                 )
+
+             with gr.Row():
+                 translate_btn = gr.Button("Translate")
+
+             with gr.Row():
+                 output = gr.Textbox(
+                     label="Translation Results",
+                     lines=10
+                 )
+
+             # History viewer
+             with gr.Accordion("Translation History", open=False):
+                 lang_select = gr.Dropdown(
+                     choices=list(self.languages.keys()),
+                     value="English",
+                     label="Select Language"
+                 )
+                 history_output = gr.Textbox(
+                     label="Translation History",
+                     lines=20
+                 )
+
+             # Event handlers
+             translate_btn.click(
+                 fn=self.process_translation,
+                 inputs=[audio_input, source_lang, target_lang],
+                 outputs=output
+             )
+
+             lang_select.change(
+                 fn=lambda x: self.list_translations(self.languages[x]["code"]),
+                 inputs=[lang_select],
+                 outputs=history_output
+             )
+
+         return interface
+
+ def run_app():
+     # Create app instance
+     app = AzureSpeechTranslatorApp()
+
+     # Launch Gradio interface
+     interface = app.create_interface()
+     interface.launch(
+         share=True,
+         server_name="0.0.0.0"
+     )
+
+ if __name__ == "__main__":
+     run_app()
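+
+ # Local run sketch (endpoint URL and key below are placeholders, not real values):
+ #   export AZURE_ENDPOINT="https://<endpoint>.<region>.inference.ml.azure.com/score"
+ #   export AZURE_API_KEY="<key>"
+ #   python app.py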
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ azure-ai-inference==1.0.0b9
+ azureml-inference-server-http==1.0.0
+ soundfile