Spaces: salmanmapkar (Space status: Runtime error)

Commit 2f47bf1 • Update app.py
salmanmapkar committed • Parent(s): 25a24aa

app.py CHANGED
@@ -17,14 +17,15 @@ import moviepy.editor as mp
 import datetime
 import torch
 import pyannote.audio
-from pyannote.audio.pipelines.speaker_verification import
+from pyannote.audio.pipelines.speaker_verification import SpeechBrainPretrainedSpeakerEmbedding #PyannoteAudioPretrainedSpeakerEmbedding
 from pyannote.audio import Audio
 from pyannote.core import Segment
 import wave
 import contextlib
 from sklearn.cluster import AgglomerativeClustering
 import numpy as np
-
+import json
+from datetime import timedelta

 __FILES = set()

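The two standard-library imports added here feed the new timestamped output further down in this commit: json is used to dump the diarization and transcription payloads for logging, and timedelta renders segment start/end times. A quick illustration of the formatting involved (not part of the commit itself):

    # Illustration only: the timestamp and JSON formatting the new imports enable.
    import json
    from datetime import timedelta

    print(str(timedelta(seconds=float(75.5))))   # 0:01:15.500000  (the format used by get_output below)
    print(json.dumps({"start": 500, "end": 4750, "speaker": "SPEAKER_00"}))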
@@ -140,7 +141,8 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):

 def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     model = whisper.load_model("medium")
-    embedding_model =
+    # embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
+    embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
         "speechbrain/spkrec-ecapa-voxceleb",
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     )
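For context, a minimal sketch of how this SpeechBrain backend is typically used together with pyannote's Audio helper to embed a slice of the working WAV file; the file name and segment bounds are placeholders, and the Transcribe_V2 code that actually consumes the embeddings sits outside this hunk:

    # Hedged sketch, not code from this commit: embed one 5-second window.
    import torch
    from pyannote.audio import Audio
    from pyannote.core import Segment
    from pyannote.audio.pipelines.speaker_verification import SpeechBrainPretrainedSpeakerEmbedding

    embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
        "speechbrain/spkrec-ecapa-voxceleb",
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    helper = Audio()
    waveform, sample_rate = helper.crop("temp_audio.wav", Segment(0.0, 5.0))
    embedding = embedding_model(waveform[None])   # (1, 192) ECAPA speaker embedding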
@@ -158,23 +160,50 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
         return speaker

     # audio = Audio()
+    def diarization(audio):
+        def millisec(timeStr):
+            spl = timeStr.split(":")
+            s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2]) )* 1000)
+            return s
+        as_audio = AudioSegment.from_wav(audio)
+        DEMO_FILE = {'uri': 'blabal', 'audio': audio}
+        hparams = pipeline.parameters(instantiated=True)
+        hparams["segmentation"]["min_duration_off"] -= 0.25
+        pipeline.instantiate(hparams)
+        if num_speakers:
+            dz = pipeline(DEMO_FILE, num_speakers=num_speakers)
+        else:
+            dz = pipeline(DEMO_FILE)
+        with open(CreateFile(f"diarization_{audio}.txt"), "w") as text_file:
+            text_file.write(str(dz))
+        dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
+        print(dz)
+        dzList = []
+        for l in dz:
+            start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
+            start = millisec(start)
+            end = millisec(end)
+            lex = GetSpeaker(re.findall('(SPEAKER_[0-9][0-9])', string=l)[0])
+            dzList.append([start, end, lex])
+        return dzList
+
     def get_output(segments):
         # print(segments)
         conversation=[]
         for (i, segment) in enumerate(segments):
             # print(f"{i}, {segment["speaker"]}, {segments[i - 1]["speaker"]}, {}")
             if not len(conversation):
-                conversation.append([GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
-            elif conversation[-1][
-                conversation[-1][
+                conversation.append([str(timedelta(seconds=float(segment['start']))),str(timedelta(seconds=float(segment['end']))),GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
+            elif conversation[-1][2] == GetSpeaker(segment["speaker"]):
+                conversation[-1][3] += segment["text"].lstrip()
             else:
-                conversation.append([GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
+                conversation.append([str(timedelta(seconds=float(segment['start']))),str(timedelta(seconds=float(segment['end']))),GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
             # if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
             # if i != 0:
             # conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
             # conversation[-1][1] += segment["text"][1:]
         # return output
-        return ("".join([f"{
+        return ("".join([f"[{start}] - {speaker} \n{text}\n" for start, end, speaker, text in conversation])), ({ "data": [{"start": start, "end":end, "speaker": speaker, "text": text} for start, end, speaker, text in conversation]})

     def get_duration(path):
         with contextlib.closing(wave.open(path,'r')) as f:
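The new diarization helper leans on pyannote's textual rendering of the diarization result: each line of str(dz) looks roughly like "[ 00:00:00.500 -->  00:00:04.750] A SPEAKER_00", and the two regular expressions pull the timestamps and the speaker label back out. A small worked example of that parsing step (the sample line is illustrative, not output from this Space):

    # Hedged example of the parsing done inside diarization(); GetSpeaker is the app's own mapper.
    import re

    line = "[ 00:00:00.500 -->  00:00:04.750] A SPEAKER_00"
    start, end = tuple(re.findall(r'[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=line))

    def millisec(timeStr):
        spl = timeStr.split(":")
        return int((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2])) * 1000)

    print(millisec(start), millisec(end))                       # 500 4750
    print(re.findall(r'(SPEAKER_[0-9][0-9])', string=line)[0])  # SPEAKER_00
    # The corresponding dzList entry would be [500, 4750, GetSpeaker("SPEAKER_00")].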
@@ -209,7 +238,9 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     if duration > 4 * 60 * 60:
         return "Audio duration too long"

+    print(json.dumps(diarization(audio)))
     result = model.transcribe(audio)
+    print(json.dumps(result))

     segments = result["segments"]

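Both new print(json.dumps(...)) lines are debug logging. Whisper's transcribe() returns a dict whose "segments" list is what get_output later walks; the shape below is illustrative only, and the "speaker" field is attached by the app's own clustering step rather than by Whisper itself:

    # Illustrative payload, not actual output of this Space.
    import json

    result = {
        "text": " Hello everyone.",
        "language": "en",
        "segments": [
            {"id": 0, "start": 0.0, "end": 4.75, "text": " Hello everyone.", "speaker": "SPEAKER_00"},
            # one entry per transcribed segment
        ],
    }
    print(json.dumps(result["segments"][0]))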
@@ -233,7 +264,7 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
         return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        return
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")

@@ -249,10 +280,10 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
         return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-        return
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
-    return
+    return Transcribe_V2(NumberOfSpeakers, SpeakerNames)

 def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
     if retries:
@@ -276,7 +307,7 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
         stream = ffmpeg.input('temp_audio.m4a')
         stream = ffmpeg.output(stream, 'temp_audio.wav')
         RemoveFile("temp_audio.m4a")
-        return
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error(f"Unable to get video from {URL}")

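With this commit all three wrappers (AudioTranscribe, VideoTranscribe, YoutubeTranscribe) stop returning bare values after preparing temp_audio.wav and instead return Transcribe_V2(NumberOfSpeakers, SpeakerNames), relying on its audio="temp_audio.wav" default. A hedged usage sketch; the diff does not show Transcribe_V2's return statement, so the assumption here is that it returns the text transcript plus the JSON-able dict built by get_output:

    # Assumed call pattern for the Youtube entry point.
    transcript_text, transcript_json = YoutubeTranscribe(
        NumberOfSpeakers=2,
        SpeakerNames="Speaker 1, Speaker 2",
        URL="https://www.youtube.com/watch?v=GECcjrYHH8w",
    )
    print(transcript_text)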
@@ -299,27 +330,28 @@ at = gr.Interface(
 # demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
 # demo.launch()
 with gr.Blocks() as yav_ui:
-    with gr.
-        with gr.
-            (15 removed layout lines, truncated in the diff view)
-        with gr.
-            (3 removed layout lines, truncated in the diff view)
+    with gr.Row():
+        with gr.Column():
+            with gr.Tab("Youtube", id=1):
+                yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+                yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+                yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
+                ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
+            with gr.Tab("Video", id=2):
+                vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+                vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+                vinput = gr.Video(label="Video")
+                vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
+            with gr.Tab("Audio", id=3):
+                ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+                ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+                ainput = gr.Audio(label="Audio", type="filepath")
+                abutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
+        with gr.Column():
+            with gr.Tab("Text"):
+                output_textbox = gr.Textbox(label="Transcribed Text", lines=15)
+            with gr.Tab("JSON"):
+                output_json = gr.JSON(label="Transcribed JSON")
     ybutton_transcribe.click(
         fn=YoutubeTranscribe,
         inputs=[yinput_nos,yinput_sn,yinput],
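The hunk's trailing context stops right after the inputs list of ybutton_transcribe.click, so the outputs wiring is not visible in this diff. Given the two output components defined above, the three click handlers are presumably connected along these lines (a sketch, not the literal continuation of app.py):

    # Assumed wiring for the part of the file beyond this hunk.
    ybutton_transcribe.click(
        fn=YoutubeTranscribe,
        inputs=[yinput_nos, yinput_sn, yinput],
        outputs=[output_textbox, output_json],
    )
    vbutton_transcribe.click(
        fn=VideoTranscribe,
        inputs=[vinput_nos, vinput_sn, vinput],
        outputs=[output_textbox, output_json],
    )
    abutton_transcribe.click(
        fn=AudioTranscribe,
        inputs=[ainput_nos, ainput_sn, ainput],
        outputs=[output_textbox, output_json],
    )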