Cahlil committed on
Commit
d78cd77
·
1 Parent(s): edd3cca

big changes to app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -17
app.py CHANGED
@@ -4,33 +4,47 @@ from transformers import pipeline
4
 
5
  asr = pipeline(
6
  "automatic-speech-recognition",
7
- model="facebook/s2t-wav2vec2-large-en-de",
8
- feature_extractor="facebook/s2t-wav2vec2-large-en-de",
 
9
  )
10
-
11
- def speech_to_text(audio):
12
- translation = asr(audio)
13
- return translation
14
 
15
  def diarization(audio):
16
- pipeline = Pipeline.from_pretrained("pyannote/speaker-segmentation")
17
- output = pipeline(audio)
18
- result = ""
19
- for turn, _, speaker in output.itertracks(yield_label=True):
20
- text_result = speech_to_text(audio)
21
- result += "{} said '{}' from {:.3f} to {:.3f}\n".format(speaker,text_result,turn.start,turn.end)
22
- return "No output" if result == "" else result
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  title = "Speech Recognition with Speaker Diarization"
25
  description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
26
  article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"
 
 
 
27
 
28
  app = gr.Interface(fn=diarization,
29
- inputs=gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:"),
30
- outputs=gr.outputs.Textbox(type="auto", label="OUTPUT"),
31
- examples=[["test_audio1.wav"]],
32
  title=title,
33
  description=description,
34
  article=article,
35
  allow_flagging=False)
36
- app.launch(enable_queue=True)
 
4
 
5
# Heavyweight models are loaded once at module import so each request reuses them.

# Both the model and its feature extractor come from the same checkpoint.
_ASR_CHECKPOINT = "facebook/wav2vec2-large-960h-lv60-self"

# Wav2Vec2 ASR pipeline; used with return_timestamps="word" to get per-word timings.
asr = pipeline(
    "automatic-speech-recognition",
    model=_ASR_CHECKPOINT,
    feature_extractor=_ASR_CHECKPOINT,
)

# Pyannote speaker-diarization pipeline: yields speaker turns with start/end times.
speaker_diarization = Pipeline.from_pretrained("pyannote/speaker-diarization")
 
 
 
12
 
13
def diarization(audio):
    """Attribute the ASR transcript to the speakers found by diarization.

    Runs pyannote speaker diarization and word-level ASR on the same audio,
    then assigns each transcribed word to the speaker turn it falls inside,
    keyed on the word's end timestamp.

    Parameters
    ----------
    audio : str
        Path to the input audio file (gradio supplies a filepath — see the
        ``gr.inputs.Audio(type="filepath")`` input below).

    Returns
    -------
    tuple
        ``(diarized_output, full_text)`` — one line per speaker turn, plus
        the full lowercased ASR transcript for comparison.
    """
    speaker_output = speaker_diarization(audio)
    text_output = asr(audio, return_timestamps="word")

    full_text = text_output['text'].lower()
    chunks = text_output['chunks']

    diarized_output = ""
    i = 0  # index into chunks; advances monotonically across turns
    for turn, _, speaker in speaker_output.itertracks(yield_label=True):
        words = []
        while i < len(chunks):
            # End time of the current ASR word chunk.
            # NOTE(review): assumes every chunk has a numeric end timestamp —
            # confirm for this checkpoint.
            time_index = chunks[i]['timestamp'][1]
            if time_index > turn.end:
                # Word ends after this turn: leave it for the next speaker turn.
                break
            if time_index >= turn.start:
                words.append(chunks[i]['text'].lower())
            # Always consume the chunk once matched or skipped. The original
            # broke out BEFORE advancing i when time_index == turn.end, so a
            # word ending exactly at the boundary could be attributed to the
            # next turn as well (double-counted).
            i += 1

        # ' '.join avoids the trailing space the original left inside the quotes.
        diarized_output += "{} said '{}' from {:.3f} to {:.3f}\n".format(
            speaker, ' '.join(words), turn.start, turn.end)

    return diarized_output, full_text
34
 
35
# --- Page copy shown on the gradio interface -------------------------------
title = "Speech Recognition with Speaker Diarization"
description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"

# One uploaded audio file in; two text boxes out (diarized view + raw transcript).
audio_input = gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:")
text_outputs = [
    gr.outputs.Textbox(type="auto", label="Diarized Output"),
    gr.outputs.Textbox(type="auto", label="Full ASR Text for comparison"),
]

app = gr.Interface(
    fn=diarization,
    inputs=audio_input,
    outputs=text_outputs,
    examples=[["test_audio1.wav"]],
    title=title,
    description=description,
    article=article,
    allow_flagging=False,
)
app.launch()