Porjaz committed (verified)
Commit 8218447 · Parent(s): 19502c3

Update app.py

Files changed (1):
  1. app.py +65 -30
app.py CHANGED
@@ -29,18 +29,11 @@ def recap_sentence(string):
     return recap_result
 
 
-def return_prediction_w2v2(mic=None, file=None, progress=gr.Progress(), device=device):
+def return_prediction_w2v2_mic(mic=None, progress=gr.Progress(), device=device):
     if mic is not None:
         download_path = mic.split(".")[0] + ".txt"
         waveform, sr = librosa.load(mic, sr=16000)
-        # waveform = waveform[:60*sr]
         w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
-    elif file is not None:
-        download_path = file.split(".")[0] + ".txt"
-        waveform, sr = librosa.load(file, sr=16000)
-        # waveform = waveform[:60*sr]
-        w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
-    else:
         return "You must either provide a mic recording or a file"
 
     recap_result = ""
@@ -76,33 +69,49 @@ def return_prediction_w2v2(mic=None, file=None, progress=gr.Progress(), device=d
     return recap_result, download_path
 
 
-
-def return_prediction_whisper(mic=None, file=None, device=device):
-    if mic is not None:
-        waveform, sr = librosa.load(mic, sr=16000)
-        waveform = waveform[:30*sr]
-        whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
-    elif file is not None:
+def return_prediction_w2v2_file(file=None, progress=gr.Progress(), device=device):
+    if file is not None:
+        download_path = file.split(".")[0] + ".txt"
         waveform, sr = librosa.load(file, sr=16000)
-        waveform = waveform[:30*sr]
-        whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
-    else:
+        w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
         return "You must either provide a mic recording or a file"
 
-    recap_result = recap_sentence(whisper_result[0])
+    recap_result = ""
+    prev_segment = ""
+    prev_segment_len = 0
 
-    # If the letter after punct is small, recap it
-    for i, letter in enumerate(recap_result):
-        if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
-            recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
+    for k, segment in enumerate(w2v2_result):
+        progress(0.75, desc=" Пост-процесирање на транскриптот")
 
-    clean_up_memory()
-    return recap_result
+        if prev_segment == "":
+            recap_segment = recap_sentence(segment)
+        else:
+            prev_segment_len = len(prev_segment.split())
+            recap_segment = recap_sentence(prev_segment + " " + segment)
+            # remove prev_segment from the beginning of the recap_result
+            recap_segment = recap_segment.split()
+            recap_segment = recap_segment[prev_segment_len:]
+            recap_segment = " ".join(recap_segment)
+        prev_segment = segment[0]
+        recap_result += recap_segment + " "
 
+    # If the letter after punct is small, recap it
+    for i, letter in enumerate(recap_result):
+        if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
+            recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
+
+    clean_up_memory()
+
+    progress(1.0, desc=" Крај на транскрипцијата")
+    with open(download_path, "w") as f:
+        f.write(recap_result)
+
+    return recap_result, download_path
 
 
 # Create a partial function with the device pre-applied
-return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
+return_prediction_w2v2_mic_with_device = partial(return_prediction_w2v2_mic, device=device)
+return_prediction_w2v2_file_with_device = partial(return_prediction_w2v2_file, device=device)
 
 
 # Load the ASR models
@@ -123,6 +132,32 @@ with gr.Blocks() as mic_transcribe_wav2vec2:
     def clear_outputs():
         return {audio_input: None, output_text: "", download_file: None}
 
+    with gr.Row():
+        audio_input = gr.Audio(sources="microphone", type="filepath", label="Record Audio")
+    with gr.Row():
+        transcribe_button = gr.Button("Transcribe")
+        clear_button = gr.Button("Clear")
+    with gr.Row():
+        output_text = gr.Textbox(label="Transcription")
+    with gr.Row():
+        download_file = gr.File(label="Зачувај го транскриптот", file_count="single")
+
+    transcribe_button.click(
+        fn=return_prediction_w2v2_mic_with_device,
+        inputs=[audio_input],
+        outputs=[output_text, download_file],
+    )
+    clear_button.click(
+        fn=clear_outputs,
+        inputs=[],
+        outputs=[audio_input, output_text, download_file],
+    )
+
+
+with gr.Blocks() as file_transcribe_wav2vec2:
+    def clear_outputs():
+        return {audio_input: None, output_text: "", download_file: None}
+
     with gr.Row():
         audio_input = gr.Audio(sources="upload", type="filepath", label="Record Audio")
     with gr.Row():
@@ -131,10 +166,10 @@ with gr.Blocks() as mic_transcribe_wav2vec2:
     with gr.Row():
         output_text = gr.Textbox(label="Transcription")
     with gr.Row():
-        download_file = gr.File(label="Зачувај го транскриптот", file_count="single", height=50)
+        download_file = gr.File(label="Зачувај го транскриптот", file_count="single")
 
     transcribe_button.click(
-        fn=return_prediction_w2v2_with_device,
+        fn=return_prediction_w2v2_file_with_device,
         inputs=[audio_input],
         outputs=[output_text, download_file],
     )
@@ -214,8 +249,8 @@ with transcriber_app:
     # state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
 
     gr.TabbedInterface(
-        [mic_transcribe_wav2vec2],
-        ["Буки-W2v2 транскрипција"],
+        [mic_transcribe_wav2vec2, file_transcribe_wav2vec2],
+        ["Буки-w2v2 транскрипција од микрофон", "Буки-w2v2 транскрипција од фајл"],
    )
     state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
 
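
For reference, the capitalization pass that both new functions run after recap_sentence simply upper-cases any lowercase letter that follows sentence-final punctuation. The standalone sketch below reproduces that loop outside the Gradio app; the helper name capitalize_after_punct and the sample sentence are illustrative only and do not appear in the repository.

    # Standalone sketch of the post-processing loop from app.py; the function
    # name and the example sentence are invented for illustration.
    def capitalize_after_punct(text: str) -> str:
        # Whenever the character two positions back is ".", "!" or "?"
        # (i.e. the pattern "X. y"), upper-case the current lowercase letter.
        for i, letter in enumerate(text):
            if i > 1 and text[i - 2] in [".", "!", "?"] and letter.islower():
                text = text[:i] + letter.upper() + text[i + 1:]
        return text

    if __name__ == "__main__":
        print(capitalize_after_punct("здраво. како си? добро сум."))
        # -> здраво. Како си? Добро сум.

Rebinding the string inside the enumerate loop is safe here because only letter case changes: the iterator keeps walking the original characters, the string length is unchanged, and the punctuation check at index i-2 stays aligned with the rebuilt string.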