Cahlil committed on
Commit
f59034e
·
1 Parent(s): 60baa43

update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -25
app.py CHANGED
@@ -8,15 +8,10 @@ asr = pipeline(
8
  feature_extractor="facebook/wav2vec2-large-960h-lv60-self",
9
 
10
  )
11
- pipeline1 = Pipeline.from_pretrained("pyannote/speaker-segmentation")
12
 
13
- def diarization(file_input,mic_input,selection):
14
- mic_path = None if mic_input is None else mic_input.name
15
- audio = file_input if selection == "Upload" else mic_path
16
- if audio is None:
17
- return "Please check your inputs!", ""
18
-
19
- speaker_output = pipeline1(audio)
20
  text_output = asr(audio,return_timestamps="word")
21
 
22
  full_text = text_output['text'].lower()
@@ -26,29 +21,31 @@ def diarization(file_input,mic_input,selection):
26
  i = 0
27
  for turn, _, speaker in speaker_output.itertracks(yield_label=True):
28
  diarized = ""
29
- while i < len(chunks):
30
- time_index = chunks[i]['timestamp'][1]
31
- if time_index >= turn.start and time_index <= turn.end:
32
- diarized += chunks[i]['text'].lower() + ' '
33
- if time_index >= turn.end: break
34
  i += 1
35
-
36
- diarized_output += "{} said '{}' from {:.3f} to {:.3f}\n".format(speaker,diarized,turn.start,turn.end)
 
37
 
38
  return diarized_output, full_text
39
 
40
- title = "Speech Recognition with Speaker Diarization"
41
- description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
42
- article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"
43
- inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:", optional=True),
44
- gr.inputs.Audio(source="microphone", type="file",label="Or use your Microphone:", optional=True),
45
- gr.inputs.Radio(["Upload","Microphone"], type="value", label="Select which input:")]
 
46
  outputs = [gr.outputs.Textbox(type="auto", label="Diarized Output"),
47
- gr.outputs.Textbox(type="auto",label="Full ASR Text for comparison")]
48
- examples = [["test_audio1.wav",None,"Upload"],
49
- ["test_audio2.wav",None,"Upload"]]
 
 
 
50
 
51
- app = gr.Interface(fn=diarization,
52
  inputs=inputs,
53
  outputs=outputs,
54
  examples=examples,
 
8
  feature_extractor="facebook/wav2vec2-large-960h-lv60-self",
9
 
10
  )
11
+ speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")
12
 
13
+ def segmentation(audio):
14
+ speaker_output = speaker_segmentation(audio)
 
 
 
 
 
15
  text_output = asr(audio,return_timestamps="word")
16
 
17
  full_text = text_output['text'].lower()
 
21
  i = 0
22
  for turn, _, speaker in speaker_output.itertracks(yield_label=True):
23
  diarized = ""
24
+ while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
25
+ diarized += chunks[i]['text'].lower() + ' '
 
 
 
26
  i += 1
27
+
28
+ if diarized != "":
29
+ diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker,diarized,turn.start,turn.end)
30
 
31
  return diarized_output, full_text
32
 
33
+ title = "Speech Recognition with Speaker Segmentation"
34
+ description = "Speaker Diarization is the act of attributing individual speakers to their corresponding parts in an audio recording. This space aims to distinguish the speakers with speaker segmentation and their speech with speech-to-text from a given input audio file. Pre-trained models used are Pyannote[1] for the Speaker Segmentation and Wav2Vec2[2] for the Automatic Speech Recognition."
35
+ article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Segmentation model (GitHub repo)</a></p>"
36
+ article += "<p style='text-align: center'><a href='https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#wav2vec-20' target='_blank'>[2] Facebook Wav2Vec2 (GitHub repo)</a></p>"
37
+ article += "<p style='text-align: center'>Audio File Sources: <a href='https://www.youtube.com/watch?v=DYu_bGbZiiQ&t=132s' target='_blank'>1</a> <a href='https://www.youtube.com/watch?v=DDjWTWHHkpk&t=29s' target='_blank'>2</a> <a href='https://www.youtube.com/watch?v=G2xWg2ckKHI&t=24s' target='_blank'>3</a> <a href='https://www.youtube.com/watch?v=sCcv9uqSBU0&t=32s' target='_blank'>4</a> <a href='https://www.youtube.com/watch?v=K1hlp0DCE_8&t=71s' target='_blank'>5</a></p>"
38
+
39
+ inputs = gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:")
40
  outputs = [gr.outputs.Textbox(type="auto", label="Diarized Output"),
41
+ gr.outputs.Textbox(type="auto",label="Full ASR Text for comparison")]
42
+ examples = [["meeting_audio.wav"],
43
+ ["noisy_london_interview.wav"],
44
+ ["clean_london_interview.wav"],
45
+ ["podcast_audio.wav"],
46
+ ["air_traffic_control_audio.wav"],]
47
 
48
+ app = gr.Interface(fn=segmentation,
49
  inputs=inputs,
50
  outputs=outputs,
51
  examples=examples,