Spaces:
Runtime error
Runtime error
salmanmapkar
committed on
Commit
•
8d881b7
1
Parent(s):
2f47bf1
Update app.py
Browse files
app.py
CHANGED
@@ -28,7 +28,7 @@ import json
|
|
28 |
from datetime import timedelta
|
29 |
|
30 |
__FILES = set()
|
31 |
-
|
32 |
|
33 |
def CreateFile(filename):
|
34 |
__FILES.add(filename)
|
@@ -139,14 +139,16 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
|
|
139 |
return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
|
140 |
|
141 |
|
142 |
-
def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
|
143 |
-
model = whisper.load_model("medium")
|
144 |
# embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
|
|
|
145 |
embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
|
146 |
"speechbrain/spkrec-ecapa-voxceleb",
|
147 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
148 |
)
|
149 |
SPEAKER_DICT = {}
|
|
|
150 |
SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
|
151 |
def GetSpeaker(sp):
|
152 |
speaker = sp
|
@@ -155,6 +157,10 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
|
|
155 |
t = SPEAKERS.pop(0)
|
156 |
SPEAKER_DICT[sp] = t
|
157 |
speaker = SPEAKER_DICT[sp]
|
|
|
|
|
|
|
|
|
158 |
else:
|
159 |
speaker = SPEAKER_DICT[sp]
|
160 |
return speaker
|
@@ -253,7 +259,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
|
|
253 |
return get_output(segments)
|
254 |
# return output
|
255 |
|
256 |
-
def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
|
257 |
print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
|
258 |
if retries:
|
259 |
# subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
|
@@ -264,11 +270,11 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
|
|
264 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
265 |
if not (os.path.isfile("temp_audio.wav")):
|
266 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
267 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
268 |
else:
|
269 |
raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
|
270 |
|
271 |
-
def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
|
272 |
if retries:
|
273 |
try:
|
274 |
clip = mp.VideoFileClip(video)
|
@@ -280,12 +286,11 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
|
|
280 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
281 |
if not (os.path.isfile("temp_audio.wav")):
|
282 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
283 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
284 |
else:
|
285 |
raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
|
286 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
287 |
|
288 |
-
def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
|
289 |
if retries:
|
290 |
if "youtu" not in URL.lower():
|
291 |
raise gr.Error(f"{URL} is not a valid youtube URL.")
|
@@ -307,42 +312,28 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
|
|
307 |
stream = ffmpeg.input('temp_audio.m4a')
|
308 |
stream = ffmpeg.output(stream, 'temp_audio.wav')
|
309 |
RemoveFile("temp_audio.m4a")
|
310 |
-
return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
|
311 |
else:
|
312 |
raise gr.Error(f"Unable to get video from {URL}")
|
313 |
|
314 |
-
ut = gr.Interface(
|
315 |
-
fn=YoutubeTranscribe,
|
316 |
-
inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
|
317 |
-
outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
|
318 |
-
)
|
319 |
-
vt = gr.Interface(
|
320 |
-
fn=VideoTranscribe,
|
321 |
-
inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
|
322 |
-
outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
|
323 |
-
)
|
324 |
-
at = gr.Interface(
|
325 |
-
fn=AudioTranscribe,
|
326 |
-
inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
|
327 |
-
outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
|
328 |
-
)
|
329 |
|
330 |
-
# demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
|
331 |
-
# demo.launch()
|
332 |
with gr.Blocks() as yav_ui:
|
333 |
with gr.Row():
|
334 |
with gr.Column():
|
335 |
with gr.Tab("Youtube", id=1):
|
|
|
336 |
yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
337 |
yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
338 |
yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
|
339 |
ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
340 |
with gr.Tab("Video", id=2):
|
|
|
341 |
vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
342 |
vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
343 |
vinput = gr.Video(label="Video")
|
344 |
vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
345 |
with gr.Tab("Audio", id=3):
|
|
|
346 |
ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
347 |
ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
348 |
ainput = gr.Audio(label="Audio", type="filepath")
|
|
|
28 |
from datetime import timedelta
|
29 |
|
30 |
__FILES = set()
|
31 |
+
wispher_models = list(whisper._MODELS.keys())
|
32 |
|
33 |
def CreateFile(filename):
|
34 |
__FILES.add(filename)
|
|
|
139 |
return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
|
140 |
|
141 |
|
142 |
+
def Transcribe_V2(model, num_speakers, speaker_names, audio="temp_audio.wav"):
|
143 |
+
#model = whisper.load_model("medium")
|
144 |
# embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
|
145 |
+
|
146 |
embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
|
147 |
"speechbrain/spkrec-ecapa-voxceleb",
|
148 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
149 |
)
|
150 |
SPEAKER_DICT = {}
|
151 |
+
default_speaker_names = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
|
152 |
SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
|
153 |
def GetSpeaker(sp):
|
154 |
speaker = sp
|
|
|
157 |
t = SPEAKERS.pop(0)
|
158 |
SPEAKER_DICT[sp] = t
|
159 |
speaker = SPEAKER_DICT[sp]
|
160 |
+
elif len(default_speaker_names):
|
161 |
+
t = default_speaker_names.pop(0)
|
162 |
+
SPEAKER_DICT[sp] = t
|
163 |
+
speaker = SPEAKER_DICT[sp]
|
164 |
else:
|
165 |
speaker = SPEAKER_DICT[sp]
|
166 |
return speaker
|
|
|
259 |
return get_output(segments)
|
260 |
# return output
|
261 |
|
262 |
+
def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5, model='base'):
|
263 |
print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
|
264 |
if retries:
|
265 |
# subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
|
|
|
270 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
271 |
if not (os.path.isfile("temp_audio.wav")):
|
272 |
return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
|
273 |
+
return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
|
274 |
else:
|
275 |
raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
|
276 |
|
277 |
+
def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5, model='base'):
|
278 |
if retries:
|
279 |
try:
|
280 |
clip = mp.VideoFileClip(video)
|
|
|
286 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
287 |
if not (os.path.isfile("temp_audio.wav")):
|
288 |
return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
|
289 |
+
return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
|
290 |
else:
|
291 |
raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
|
|
|
292 |
|
293 |
+
def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5, model='base'):
|
294 |
if retries:
|
295 |
if "youtu" not in URL.lower():
|
296 |
raise gr.Error(f"{URL} is not a valid youtube URL.")
|
|
|
312 |
stream = ffmpeg.input('temp_audio.m4a')
|
313 |
stream = ffmpeg.output(stream, 'temp_audio.wav')
|
314 |
RemoveFile("temp_audio.m4a")
|
315 |
+
return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
|
316 |
else:
|
317 |
raise gr.Error(f"Unable to get video from {URL}")
|
318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
|
|
|
|
|
320 |
with gr.Blocks() as yav_ui:
|
321 |
with gr.Row():
|
322 |
with gr.Column():
|
323 |
with gr.Tab("Youtube", id=1):
|
324 |
+
ysz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
|
325 |
yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
326 |
yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
327 |
yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
|
328 |
ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
329 |
with gr.Tab("Video", id=2):
|
330 |
+
vsz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
|
331 |
vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
332 |
vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
333 |
vinput = gr.Video(label="Video")
|
334 |
vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
|
335 |
with gr.Tab("Audio", id=3):
|
336 |
+
asz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
|
337 |
ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
|
338 |
ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
|
339 |
ainput = gr.Audio(label="Audio", type="filepath")
|