Spaces:
Runtime error
Runtime error
salmanmapkar
committed on
Commit
•
8d881b7
1
Parent(s):
2f47bf1
Update app.py
Browse files
app.py
CHANGED
@@ -28,7 +28,7 @@ import json
|
|
28 |
from datetime import timedelta
|
29 |
|
30 |
__FILES = set()
|
31 |
-
|
32 |
|
33 |
def CreateFile(filename):
|
34 |
__FILES.add(filename)
|
@@ -139,14 +139,16 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
|
|
139 |
return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
|
140 |
|
141 |
|
142 |
-
def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
|
143 |
-
model = whisper.load_model("medium")
|
144 |
# embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
|
|
|
145 |
embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
|
146 |
"speechbrain/spkrec-ecapa-voxceleb",
|
147 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
148 |
)
|
149 |
SPEAKER_DICT = {}
|
|
|
150 |
SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
|
151 |
def GetSpeaker(sp):
|
152 |
speaker = sp
|
@@ -155,6 +157,10 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
|
|
155 |
t = SPEAKERS.pop(0)
|
156 |
SPEAKER_DICT[sp] = t
|
157 |
speaker = SPEAKER_DICT[sp]
|
|
|
|
|
|
|
|
|
158 |
else:
|
159 |
speaker = SPEAKER_DICT[sp]
|
160 |
return speaker
|
@@ -253,7 +259,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
|
|
253 |
return get_output(segments)
|
254 |
# return output
|
255 |
|
256 |
-
def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
|
257 |
print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
|
258 |
if retries:
|
259 |
# subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
|
@@ -264,11 +270,11 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
|
|
264 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
265 |
if not (os.path.isfile("temp_audio.wav")):
|
266 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
267 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
268 |
else:
|
269 |
raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
|
270 |
|
271 |
-
def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
|
272 |
if retries:
|
273 |
try:
|
274 |
clip = mp.VideoFileClip(video)
|
@@ -280,12 +286,11 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
|
|
280 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
281 |
if not (os.path.isfile("temp_audio.wav")):
|
282 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
283 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
284 |
else:
|
285 |
raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
|
286 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
287 |
|
288 |
-
def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
|
289 |
if retries:
|
290 |
if "youtu" not in URL.lower():
|
291 |
raise gr.Error(f"{URL} is not a valid youtube URL.")
|
@@ -307,42 +312,28 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
|
|
307 |
stream = ffmpeg.input('temp_audio.m4a')
|
308 |
stream = ffmpeg.output(stream, 'temp_audio.wav')
|
309 |
RemoveFile("temp_audio.m4a")
|
310 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
311 |
else:
|
312 |
raise gr.Error(f"Unable to get video from {URL}")
|
313 |
|
314 |
-
ut = gr.Interface(
|
315 |
-
fn=YoutubeTranscribe,
|
316 |
-
inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
|
317 |
-
outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
|
318 |
-
)
|
319 |
-
vt = gr.Interface(
|
320 |
-
fn=VideoTranscribe,
|
321 |
-
inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
|
322 |
-
outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
|
323 |
-
)
|
324 |
-
at = gr.Interface(
|
325 |
-
fn=AudioTranscribe,
|
326 |
-
inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
|
327 |
-
outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
|
328 |
-
)
|
329 |
|
330 |
-
# demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
|
331 |
-
# demo.launch()
|
332 |
with gr.Blocks() as yav_ui:
|
333 |
with gr.Row():
|
334 |
with gr.Column():
|
335 |
with gr.Tab("Youtube", id=1):
|
|
|
336 |
yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
337 |
yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
338 |
yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
|
339 |
ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
340 |
with gr.Tab("Video", id=2):
|
|
|
341 |
vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
342 |
vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
343 |
vinput = gr.Video(label="Video")
|
344 |
vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
345 |
with gr.Tab("Audio", id=3):
|
|
|
346 |
ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
347 |
ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
348 |
ainput = gr.Audio(label="Audio", type="filepath")
|
|
|
28 |
from datetime import timedelta
|
29 |
|
30 |
__FILES = set()
|
31 |
+
wispher_models = list(whisper._MODELS.keys())
|
32 |
|
33 |
def CreateFile(filename):
|
34 |
__FILES.add(filename)
|
|
|
139 |
return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
|
140 |
|
141 |
|
142 |
+
def Transcribe_V2(model, num_speakers, speaker_names, audio="temp_audio.wav"):
|
143 |
+
#model = whisper.load_model("medium")
|
144 |
# embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
|
145 |
+
|
146 |
embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
|
147 |
"speechbrain/spkrec-ecapa-voxceleb",
|
148 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
149 |
)
|
150 |
SPEAKER_DICT = {}
|
151 |
+
default_speaker_names = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
|
152 |
SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
|
153 |
def GetSpeaker(sp):
|
154 |
speaker = sp
|
|
|
157 |
t = SPEAKERS.pop(0)
|
158 |
SPEAKER_DICT[sp] = t
|
159 |
speaker = SPEAKER_DICT[sp]
|
160 |
+
elif len(default_speaker_names):
|
161 |
+
t = default_speaker_names.pop(0)
|
162 |
+
SPEAKER_DICT[sp] = t
|
163 |
+
speaker = SPEAKER_DICT[sp]
|
164 |
else:
|
165 |
speaker = SPEAKER_DICT[sp]
|
166 |
return speaker
|
|
|
259 |
return get_output(segments)
|
260 |
# return output
|
261 |
|
262 |
+
def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5, model='base'):
|
263 |
print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
|
264 |
if retries:
|
265 |
# subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
|
|
|
270 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
271 |
if not (os.path.isfile("temp_audio.wav")):
|
272 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
273 |
+
return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
|
274 |
else:
|
275 |
raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
|
276 |
|
277 |
+
def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5, model='base'):
|
278 |
if retries:
|
279 |
try:
|
280 |
clip = mp.VideoFileClip(video)
|
|
|
286 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
287 |
if not (os.path.isfile("temp_audio.wav")):
|
288 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
289 |
+
return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
|
290 |
else:
|
291 |
raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
|
|
|
292 |
|
293 |
+
def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5, model='base'):
|
294 |
if retries:
|
295 |
if "youtu" not in URL.lower():
|
296 |
raise gr.Error(f"{URL} is not a valid youtube URL.")
|
|
|
312 |
stream = ffmpeg.input('temp_audio.m4a')
|
313 |
stream = ffmpeg.output(stream, 'temp_audio.wav')
|
314 |
RemoveFile("temp_audio.m4a")
|
315 |
+
return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
|
316 |
else:
|
317 |
raise gr.Error(f"Unable to get video from {URL}")
|
318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
|
|
|
|
|
320 |
with gr.Blocks() as yav_ui:
|
321 |
with gr.Row():
|
322 |
with gr.Column():
|
323 |
with gr.Tab("Youtube", id=1):
|
324 |
+
ysz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
|
325 |
yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
326 |
yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
327 |
yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
|
328 |
ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
329 |
with gr.Tab("Video", id=2):
|
330 |
+
vsz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
|
331 |
vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
332 |
vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
333 |
vinput = gr.Video(label="Video")
|
334 |
vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
335 |
with gr.Tab("Audio", id=3):
|
336 |
+
asz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
|
337 |
ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
338 |
ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
339 |
ainput = gr.Audio(label="Audio", type="filepath")
|