big changes to app.py
app.py
CHANGED
@@ -4,33 +4,47 @@ from transformers import pipeline
 
 asr = pipeline(
     "automatic-speech-recognition",
-    model="facebook/
-    feature_extractor="facebook/
+    model="facebook/wav2vec2-large-960h-lv60-self",
+    feature_extractor="facebook/wav2vec2-large-960h-lv60-self",
+
 )
-
-def speech_to_text(audio):
-    translation = asr(audio)
-    return translation
+speaker_diarization = Pipeline.from_pretrained("pyannote/speaker-diarization")
 
 def diarization(audio):
-
-
-
-
-
-
-
+    speaker_output = speaker_diarization(audio)
+    text_output = asr(audio,return_timestamps="word")
+
+    full_text = text_output['text'].lower()
+    chunks = text_output['chunks']
+
+    diarized_output = ""
+    i = 0
+    for turn, _, speaker in speaker_output.itertracks(yield_label=True):
+        diarized = ""
+        while i < len(chunks):
+            time_index = chunks[i]['timestamp'][1]
+            if time_index >= turn.start and time_index <= turn.end:
+                diarized += chunks[i]['text'].lower() + ' '
+            if time_index >= turn.end: break
+            i += 1
+
+        diarized_output += "{} said '{}' from {:.3f} to {:.3f}\n".format(speaker,diarized,turn.start,turn.end)
+
+    return diarized_output, full_text
 
 title = "Speech Recognition with Speaker Diarization"
 description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
 article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"
+inputs = gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:")
+outputs = [gr.outputs.Textbox(type="auto", label="Diarized Output"),gr.outputs.Textbox(type="auto",label="Full ASR Text for comparison")]
+examples = [["test_audio1.wav"]]
 
 app = gr.Interface(fn=diarization,
-                   inputs=
-                   outputs=
-                   examples=
+                   inputs=inputs,
+                   outputs=outputs,
+                   examples=examples,
                    title=title,
                    description=description,
                    article=article,
                    allow_flagging=False)
-app.launch(
+app.launch()
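The hunk starts at line 4 of app.py, so the module header is not shown. For the calls the new code makes (gr.Interface, gr.inputs.Audio, gr.outputs.Textbox, pipeline, Pipeline.from_pretrained) to resolve, the first lines of the file would presumably look like the sketch below; the pyannote.audio import is an assumption inferred from the diff, not part of the commit.

# Assumed header of app.py (lines 1-3 sit above this hunk and are not shown in it).
import gradio as gr                  # gr.Interface, gr.inputs.Audio, gr.outputs.Textbox
from transformers import pipeline    # confirmed by the hunk's context line
from pyannote.audio import Pipeline  # assumed; needed for Pipeline.from_pretrained("pyannote/speaker-diarization")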
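The matching loop in the new diarization() relies on two output shapes: the ASR pipeline called with return_timestamps="word" returns a dict holding the full transcript plus a chunks list of per-word (start, end) timestamps, and itertracks(yield_label=True) on the pyannote result yields (segment, track, label) tuples whose segments expose start and end in seconds. A minimal inspection sketch, assuming the same asr and speaker_diarization objects and the bundled test_audio1.wav example; the printed values are illustrative only.

# Illustrative only: inspect the structures that diarization() consumes.
text_output = asr("test_audio1.wav", return_timestamps="word")
print(text_output["text"])        # full transcript, e.g. "HELLO WORLD ..."
print(text_output["chunks"][0])   # per-word entry, e.g. {'text': 'HELLO', 'timestamp': (0.5, 0.9)}

speaker_output = speaker_diarization("test_audio1.wav")
for turn, _, speaker in speaker_output.itertracks(yield_label=True):
    # turn is a pyannote.core.Segment; start and end are floats in seconds
    print("{}: {:.3f}s -> {:.3f}s".format(speaker, turn.start, turn.end))

Because the chunk index i is shared across turns, the while loop resumes where the previous speaker's words stopped; a word whose end time falls outside every detected turn is dropped from the diarized output but still appears in the full ASR text returned alongside it.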