unijoh committed
Commit e7e280b · verified · 1 Parent(s): 1bfb695

Update asr.py

Files changed (1)
  1. asr.py +52 -101
asr.py CHANGED
@@ -1,104 +1,55 @@
-import gradio as gr
-import time
-from transformers import pipeline
 import torch
-import ffmpeg # Make sure it's ffmpeg-python
-
-def main():
-    # Check if GPU is available
-    use_gpu = torch.cuda.is_available()
-
-    # Configure the pipeline to use the GPU if available
-    if use_gpu:
-        p = pipeline(
-            "automatic-speech-recognition",
-            model="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h",
-            device=0
-        )
-    else:
-        p = pipeline(
-            "automatic-speech-recognition",
-            model="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
-        )
-
-    def extract_audio_from_m3u8(url):
-        try:
-            output_file = "output_audio.aac"
-            ffmpeg.input(url).output(output_file).run(overwrite_output=True)
-            return output_file
-        except Exception as e:
-            return f"An error occurred: {e}"
-
-    def transcribe_function(audio, state, uploaded_audio, m3u8_url):
-        if m3u8_url:
-            audio = extract_audio_from_m3u8(m3u8_url)
-
-        if uploaded_audio is not None:
-            audio = uploaded_audio
-
-        if not audio:
-            # Return a meaningful message; no audio found
-            return {state_var: state, transcription_var: state}
-
         try:
-            time.sleep(3)
-            text = p(audio, chunk_length_s=50)["text"]
-            state += text + "\n"
-            return {state_var: state, transcription_var: state}
         except Exception as e:
-            return {
-                transcription_var: "An error occurred during transcription.",
-                state_var: state
-            }
-
-    def reset_output(transcription, state):
-        """Function to reset the state to an empty string."""
-        return "", ""
-
-    with gr.Blocks() as demo:
-        state_var = gr.State("")
-
-        with gr.Row():
-            with gr.Column():
-                microphone = gr.Audio(
-                    source="microphone",
-                    type="filepath",
-                    label="Microphone"
-                )
-                uploaded_audio = gr.Audio(
-                    label="Upload Audio File",
-                    type="filepath",
-                    source="upload"
-                )
-                m3u8_url = gr.Textbox(
-                    label="m3u8 URL | E.g.: from kvf.fo or logting.fo"
-                )
-
-            with gr.Column():
-                transcription_var = gr.Textbox(
-                    type="text",
-                    label="Transcription",
-                    readonly=True
-                )
-
-        with gr.Row():
-            transcribe_button = gr.Button("Transcribe")
-            reset_button = gr.Button("Reset output")
-
-        transcribe_button.click(
-            transcribe_function,
-            [microphone, state_var, uploaded_audio, m3u8_url],
-            [transcription_var, state_var]
-        )
-
-        reset_button.click(
-            reset_output,
-            [transcription_var, state_var],
-            [transcription_var, state_var]
-        )
-
-    # Launch with the latest Gradio features
-    demo.launch()
-
-if __name__ == "__main__":
-    main()

+import librosa
+from transformers import AutoProcessor, Wav2Vec2ForCTC
 import torch
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.DEBUG)
+
+ASR_SAMPLING_RATE = 16_000
+MODEL_ID = "facebook/mms-1b-all"
+
+try:
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+    logging.info("ASR model and processor loaded successfully.")
+except Exception as e:
+    logging.error(f"Error loading ASR model or processor: {e}")
+
+def transcribe(audio):
+    try:
+        if audio is None:
+            logging.error("No audio file provided")
+            return "ERROR: You have to either use the microphone or upload an audio file"
+
+        logging.info(f"Loading audio file: {audio}")
+
+        # Try loading the audio file with librosa
         try:
+            audio_samples, _ = librosa.load(audio, sr=ASR_SAMPLING_RATE, mono=True)
+        except FileNotFoundError:
+            logging.error("Audio file not found")
+            return "ERROR: Audio file not found"
         except Exception as e:
+            logging.error(f"Error loading audio file with librosa: {e}")
+            return f"ERROR: Unable to load audio file - {e}"
+
+        # Set the language for the processor to Faroese
+        lang_code = "fao"
+        processor.tokenizer.set_target_lang(lang_code)
+        model.load_adapter(lang_code)
+
+        # Process the audio with the processor
+        inputs = processor(audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt")
+
+        with torch.no_grad():
+            outputs = model(**inputs).logits
+
+        ids = torch.argmax(outputs, dim=-1)[0]
+        transcription = processor.decode(ids)
+
+        logging.info("Transcription completed successfully.")
+        return transcription
+    except Exception as e:
+        logging.error(f"Error during transcription: {e}")
+        return "ERROR"
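
For quick local testing, a minimal usage sketch of the new transcribe() function follows; the import path, the __main__ guard, and the file name "sample.wav" are illustrative assumptions and are not part of this commit:

# Hypothetical usage sketch (not part of the commit): transcribe a local
# recording with the Faroese ("fao") MMS adapter and print the result.
# "sample.wav" is a placeholder path; any file librosa can read should work.
from asr import transcribe

if __name__ == "__main__":
    print(transcribe("sample.wav"))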