salmanmapkar committed
Commit f2e18d1
Parent: 99dd271

Update app.py

Files changed (1): app.py (+100, -4)
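Summary of the change: this commit adds a second transcription path, Transcribe_V2, alongside the existing pyannote-based Transcribe. The new function transcribes the audio with Whisper (medium), embeds each Whisper segment with the speechbrain/spkrec-ecapa-voxceleb speaker-embedding model, clusters the embeddings with agglomerative clustering to assign speaker labels, and merges consecutive segments from the same speaker into conversation turns. AudioTranscribe, VideoTranscribe, and YoutubeTranscribe now call Transcribe_V2 instead of Transcribe.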
app.py CHANGED
@@ -13,6 +13,22 @@ import json
 pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="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")
 from pydub.effects import speedup
 import moviepy.editor as mp
+import datetime
+import torch
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio import Audio
+from pyannote.core import Segment
+import wave
+import contextlib
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
+
+model = whisper.load_model("medium")
+embedding_model = PretrainedSpeakerEmbedding(
+    "speechbrain/spkrec-ecapa-voxceleb",
+    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+)
 
 
 __FILES = set()
@@ -131,6 +147,86 @@ def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
     RemoveAllFiles()
     return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
 
+
+def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
+    SPEAKER_DICT = {}
+    SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',')]
+
+    # Map a diarization label ("SPEAKER 1", ...) to the next unused
+    # user-supplied name; keep the raw label once the names run out.
+    def GetSpeaker(sp):
+        speaker = sp
+        if sp not in list(SPEAKER_DICT.keys()):
+            if len(SPEAKERS):
+                t = SPEAKERS.pop(0)
+                SPEAKER_DICT[sp] = t
+                speaker = SPEAKER_DICT[sp]
+        else:
+            speaker = SPEAKER_DICT[sp]
+        return speaker
+
+    # Merge consecutive segments from the same speaker into single turns.
+    def get_output(segments):
+        conversation = []
+        for i, segment in enumerate(segments):
+            if not len(conversation):
+                conversation.append([GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
+            elif conversation[-1][0] == GetSpeaker(segment["speaker"]):
+                conversation[-1][1] += segment["text"].lstrip()
+            else:
+                conversation.append([GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
+        return "".join([f"{speaker} --> {text}\n" for speaker, text in conversation]), conversation
+
+    def get_duration(path):
+        with contextlib.closing(wave.open(path, 'r')) as f:
+            frames = f.getnframes()
+            rate = f.getframerate()
+            return frames / float(rate)
+
+    def make_embeddings(path, segments, duration):
+        embeddings = np.zeros(shape=(len(segments), 192))
+        for i, segment in enumerate(segments):
+            embeddings[i] = segment_embedding(path, segment, duration)
+        return np.nan_to_num(embeddings)
+
+    def segment_embedding(path, segment, duration):
+        start = segment["start"]
+        # Whisper overshoots the end timestamp in the last segment
+        end = min(duration, segment["end"])
+        clip = Segment(start, end)
+        waveform, sample_rate = Audio().crop(path, clip)
+        return embedding_model(waveform[None])
+
+    def add_speaker_labels(segments, embeddings, num_speakers):
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+        labels = clustering.labels_
+        for i in range(len(segments)):
+            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+    def time(secs):
+        return datetime.timedelta(seconds=round(secs))
+
+    duration = get_duration(audio)
+    if duration > 4 * 60 * 60:
+        return "Audio duration too long"
+
+    result = model.transcribe(audio)
+    segments = result["segments"]
+
+    num_speakers = min(max(round(num_speakers), 1), len(segments))
+    if len(segments) == 1:
+        segments[0]['speaker'] = 'SPEAKER 1'
+    else:
+        embeddings = make_embeddings(audio, segments, duration)
+        add_speaker_labels(segments, embeddings, num_speakers)
+    return get_output(segments)
+
 def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
     if retries:
         # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
@@ -141,7 +237,7 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
            return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        return Transcribe(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
 
@@ -157,10 +253,10 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
        if not (os.path.isfile("temp_audio.wav")):
            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-       return Transcribe(NumberOfSpeakers, SpeakerNames)
+       return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
    else:
        raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
-   return Transcribe(NumberOfSpeakers, SpeakerNames)
+   return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
 
 def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
    if retries:
@@ -184,7 +280,7 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
        stream = ffmpeg.input('temp_audio.m4a')
        stream = ffmpeg.output(stream, 'temp_audio.wav')
        RemoveFile("temp_audio.m4a")
-       return Transcribe(NumberOfSpeakers, SpeakerNames)
+       return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
    else:
        raise gr.Error(f"Unable to get video from {URL}")
 
 
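For context, a minimal sketch of how the new path could be exercised directly (not part of the commit). The file name meeting.wav and the speaker names are hypothetical; the input must be a WAV file readable by wave.open, since get_duration measures it with the wave module, and app.py's module-level model and embedding_model are assumed to be loaded.

# Hypothetical smoke test for Transcribe_V2; names and path are assumptions.
transcript, conversation = Transcribe_V2(
    num_speakers=2,              # clamped internally to [1, number of Whisper segments]
    speaker_names="Alice, Bob",  # assigned to diarized speakers in order of first appearance
    audio="meeting.wav",         # hypothetical path to a PCM WAV file
)

print(transcript)
# Expected shape of the output:
# Alice --> Hello, thanks for joining.
# Bob --> Glad to be here.

Note that for inputs longer than four hours Transcribe_V2 returns the bare string "Audio duration too long" rather than a (transcript, conversation) tuple, so a direct caller would need to check for that case.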