allandclive committed
Commit 6bf44a2 · Parent(s): 368aa5e

Update app.py

Files changed (1):
  1. app.py +30 -18
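
In summary, the change swaps the speech-recognition checkpoint from facebook/mms-1b-all to indonesian-nlp/wav2vec2-luganda, replaces the chunking call librosa.util.chunk (librosa has no such helper; its framing utility is librosa.util.frame) with plain list slicing, adds a microphone input next to the upload widget, and surfaces an English translation alongside the Luganda transcription.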
app.py CHANGED
@@ -1,41 +1,53 @@
 import gradio as gr
 import torch
 import librosa
+import json
 from transformers import pipeline
 from stitched_model import CombinedModel
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-# Load the model
-model = CombinedModel("facebook/mms-1b-all", "Sunbird/sunbird-mul-en-mbart-merged", device=device)
+model = CombinedModel("indonesian-nlp/wav2vec2-luganda", "Sunbird/sunbird-mul-en-mbart-merged", device=device)
+
+def transcribe(audio_file_mic=None, audio_file_upload=None):
+    if audio_file_mic:
+        audio_file = audio_file_mic
+    elif audio_file_upload:
+        audio_file = audio_file_upload
+    else:
+        return "Please upload an audio file or record one"
 
-def transcribe(audio_file):
     # Load the audio file
     speech, sample_rate = librosa.load(audio_file, sr=16000, mono=True)
 
-    # Split the audio clip into smaller chunks
-    chunk_size = 30 * 16000
-    chunks = librosa.util.chunk(speech, chunk_size)
+    # Split the audio into 30-second chunks
+    chunk_size = 30 * 16000
+    chunks = [speech[i:i + chunk_size] for i in range(0, len(speech), chunk_size)]
 
-    # Transcribe each chunk individually
+    # Process each chunk and concatenate the results
     transcriptions = []
+    translations = []
     for chunk in chunks:
-        speech_tensor = torch.tensor([chunk])
+        chunk = torch.tensor([chunk])
         with torch.no_grad():
-            transcription = model({"audio": speech_tensor})["text"]
+            transcription, translation = model({"audio": chunk})
         transcriptions.append(transcription)
+        translations.append(translation[0])
 
-    # Join the transcriptions together
-    transcription = " ".join(transcriptions)
+    transcription = "".join(transcriptions)
+    translation = "".join(translations)
 
-    return transcription
+    return transcription, translation
 
 description = '''Luganda to English Speech Translation'''
 
-iface = gr.Interface(
-    fn=transcribe,
-    inputs=[gr.Audio(source="upload", type="filepath", label="Upload Audio")],
-    outputs=[gr.Textbox(label="Transcription")],
-    description=description
-)
+iface = gr.Interface(fn=transcribe,
+                     inputs=[
+                         gr.Audio(source="microphone", type="filepath", label="Record Audio"),
+                         gr.Audio(source="upload", type="filepath", label="Upload Audio")],
+                     outputs=[gr.Textbox(label="Transcription"),
+                              gr.Textbox(label="Translation")
+                     ],
+                     description=description
+                     )
 iface.launch()
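
The stitched_model module is not touched by this commit, so the diff only implies the CombinedModel contract: the old code indexed a dict (model({"audio": ...})["text"]), while the new code unpacks a (transcription, translation) pair and reads translation[0], suggesting the translation side now returns a list of strings. Below is a minimal sketch of a class that would satisfy the new calling convention, assuming a wav2vec2 CTC checkpoint for recognition and a transformers translation pipeline for the mBART model; every name and argument here is illustrative, not the repository's actual implementation.

import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

class CombinedModel(torch.nn.Module):
    """Hypothetical stand-in: chains a CTC recognizer with a translation pipeline."""

    def __init__(self, asr_name, translation_name, device="cpu"):
        super().__init__()
        self.device = device
        self.processor = Wav2Vec2Processor.from_pretrained(asr_name)
        self.asr = Wav2Vec2ForCTC.from_pretrained(asr_name).to(device)
        # mBART checkpoints may also need src_lang/tgt_lang arguments;
        # verify against the real stitched_model before relying on this.
        self.translator = pipeline("translation", model=translation_name, device=device)

    def forward(self, inputs):
        # inputs["audio"] is a (1, num_samples) float tensor at 16 kHz,
        # matching the torch.tensor([chunk]) call in app.py.
        audio = inputs["audio"].squeeze(0)
        features = self.processor(audio, sampling_rate=16000, return_tensors="pt")
        logits = self.asr(features.input_values.to(self.device)).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)[0]
        # Translation pipelines return a list of dicts; flattening to a list
        # of strings makes the caller's translation[0] a plain string.
        translation = [o["translation_text"] for o in self.translator(transcription)]
        return transcription, translation

One behavioral note on the updated app code: "".join(transcriptions) concatenates chunk outputs with no separator, so a word that straddles a 30-second chunk boundary is glued to its neighbor; joining with " " (as the old code did) is the safer default.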