Cahlil committed on
Commit
d78cd77
·
1 Parent(s): edd3cca

big changes to app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -17
app.py CHANGED
@@ -4,33 +4,47 @@ from transformers import pipeline
4
 
5
  asr = pipeline(
6
  "automatic-speech-recognition",
7
- model="facebook/s2t-wav2vec2-large-en-de",
8
- feature_extractor="facebook/s2t-wav2vec2-large-en-de",
 
9
  )
10
-
11
- def speech_to_text(audio):
12
- translation = asr(audio)
13
- return translation
14
 
15
  def diarization(audio):
16
- pipeline = Pipeline.from_pretrained("pyannote/speaker-segmentation")
17
- output = pipeline(audio)
18
- result = ""
19
- for turn, _, speaker in output.itertracks(yield_label=True):
20
- text_result = speech_to_text(audio)
21
- result += "{} said '{}' from {:.3f} to {:.3f}\n".format(speaker,text_result,turn.start,turn.end)
22
- return "No output" if result == "" else result
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  title = "Speech Recognition with Speaker Diarization"
25
  description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
26
  article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"
 
 
 
27
 
28
  app = gr.Interface(fn=diarization,
29
- inputs=gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:"),
30
- outputs=gr.outputs.Textbox(type="auto", label="OUTPUT"),
31
- examples=[["test_audio1.wav"]],
32
  title=title,
33
  description=description,
34
  article=article,
35
  allow_flagging=False)
36
- app.launch(enable_queue=True)
 
4
 
5
# Heavyweight models are loaded once at module import so each request reuses them.

# Both the model and its feature extractor come from the same checkpoint.
_ASR_CHECKPOINT = "facebook/wav2vec2-large-960h-lv60-self"

# Wav2Vec2 ASR pipeline; used with return_timestamps="word" to get per-word timings.
asr = pipeline(
    "automatic-speech-recognition",
    model=_ASR_CHECKPOINT,
    feature_extractor=_ASR_CHECKPOINT,
)

# Pyannote speaker-diarization pipeline: yields speaker turns with start/end times.
speaker_diarization = Pipeline.from_pretrained("pyannote/speaker-diarization")
 
 
 
12
 
13
def diarization(audio):
    """Attribute the ASR transcript to the speakers found by diarization.

    Runs pyannote speaker diarization and word-level ASR on the same audio,
    then assigns each transcribed word to the speaker turn it falls inside,
    keyed on the word's end timestamp.

    Parameters
    ----------
    audio : str
        Path to the input audio file (gradio supplies a filepath — see the
        ``gr.inputs.Audio(type="filepath")`` input below).

    Returns
    -------
    tuple
        ``(diarized_output, full_text)`` — one line per speaker turn, plus
        the full lowercased ASR transcript for comparison.
    """
    speaker_output = speaker_diarization(audio)
    text_output = asr(audio, return_timestamps="word")

    full_text = text_output['text'].lower()
    chunks = text_output['chunks']

    diarized_output = ""
    i = 0  # index into chunks; advances monotonically across turns
    for turn, _, speaker in speaker_output.itertracks(yield_label=True):
        words = []
        while i < len(chunks):
            # End time of the current ASR word chunk.
            # NOTE(review): assumes every chunk has a numeric end timestamp —
            # confirm for this checkpoint.
            time_index = chunks[i]['timestamp'][1]
            if time_index > turn.end:
                # Word ends after this turn: leave it for the next speaker turn.
                break
            if time_index >= turn.start:
                words.append(chunks[i]['text'].lower())
            # Always consume the chunk once matched or skipped. The original
            # broke out BEFORE advancing i when time_index == turn.end, so a
            # word ending exactly at the boundary could be attributed to the
            # next turn as well (double-counted).
            i += 1

        # ' '.join avoids the trailing space the original left inside the quotes.
        diarized_output += "{} said '{}' from {:.3f} to {:.3f}\n".format(
            speaker, ' '.join(words), turn.start, turn.end)

    return diarized_output, full_text
34
 
35
# --- Page copy shown on the gradio interface -------------------------------
title = "Speech Recognition with Speaker Diarization"
description = "Speaker Diarization is the act of attributing parts of the audio recording to different speakers. This space aims to distinguish the speakers and apply speech-to-text from a given input audio file. Pre-trained models from Pyannote[1] for the Speaker Diarization and [2]."
article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Diarization model</a></p>"

# One uploaded audio file in; two text boxes out (diarized view + raw transcript).
audio_input = gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:")
text_outputs = [
    gr.outputs.Textbox(type="auto", label="Diarized Output"),
    gr.outputs.Textbox(type="auto", label="Full ASR Text for comparison"),
]

app = gr.Interface(
    fn=diarization,
    inputs=audio_input,
    outputs=text_outputs,
    examples=[["test_audio1.wav"]],
    title=title,
    description=description,
    article=article,
    allow_flagging=False,
)
app.launch()