kadirnar committed
Commit 19d9763
1 Parent(s): ec5308a

Update app.py

Files changed (1)
  1. app.py +159 -159
app.py CHANGED
@@ -1,17 +1,19 @@
-import gradio as gr
 
-from whisperplus.pipelines.whisper import SpeechToTextPipeline
+import gradio as gr
+import torch
+from transformers import BitsAndBytesConfig, HqqConfig
+
+from whisperplus import (
+    SpeechToTextPipeline,
+    download_youtube_to_mp3,
+    download_youtube_to_mp4,
+    format_speech_to_dialogue,
+)
+from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
+from whisperplus.pipelines.summarization import TextSummarizationPipeline
+from whisperplus.pipelines.text2speech import TextToSpeechPipeline
+from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
 from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
-from whisperplus.utils.download_utils import download_and_convert_to_mp3
-from whisperplus.utils.text_utils import format_speech_to_dialogue
-
-import subprocess
-
-def install_package(package):
-    subprocess.check_call(['pip', 'install', package, '--no-build-isolation'])
-
-# Then install flash-attn
-install_package('flash-attn')
 
 
 def youtube_url_to_text(url, model_id, language_choice):
@@ -26,17 +28,71 @@ def youtube_url_to_text(url, model_id, language_choice):
 
     Returns:
         transcript (str): The transcript of the speech-to-text conversion.
-        video_path (str): The path of the downloaded video.
     """
-    video_path = download_and_convert_to_mp3(url)
-    output = SpeechToTextPipeline(model_id)
-    print(video_path)
-    transcript = output(audio_path=video_path, language=language_choice)
+    audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")
+
+    hqq_config = HqqConfig(
+        nbits=4,
+        group_size=64,
+        quant_zero=False,
+        quant_scale=False,
+        axis=0,
+        offload_meta=False,
+    ) # axis=0 is used by default
+
+    pipeline = SpeechToTextPipeline(
+        model_id=model_id,
+        quant_config=hqq_config,
+        flash_attention_2=True,
+    )
+
+    transcript = pipeline(
+        audio_path=audio_path,
+        chunk_length_s=30,
+        stride_length_s=5,
+        max_new_tokens=128,
+        batch_size=100,
+        language=language_choice,
+        return_timestamps=False,
+    )
+    return transcript
+
+
+def summarization(text, model_id="facebook/bart-large-cnn"):
+    """
+    Main function that performs summarization using a specified model and returns the summary.
+
+    Args:
+        text (str): The text to summarize.
+        model_id (str): The ID of the summarization model to use.
+
+    Returns:
+        summary (str): The summary of the text.
+    """
+    summarizer = TextSummarizationPipeline(model_id=model_id)
+    summary = summarizer.summarize(text)
+
+    return summary[0]["summary_text"]
+
+
+def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
+    """
+    Main function that performs summarization using a specified model and returns the summary.
+
+    Args:
+        text (str): The text to summarize.
+        model_id (str): The ID of the summarization model to use.
+
+    Returns:
+        summary (str): The summary of the text.
+    """
+    summarizer = LongTextSummarizationPipeline(model_id=model_id)
+    summary_text = summarizer.summarize(text)
 
-    return transcript, video_path
+    return summary_text
 
 
-def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
+def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
     """
     Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using
     a specified model, and returns the transcript along with the video path.
@@ -54,160 +110,104 @@ def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
     pipeline = ASRDiarizationPipeline.from_pretrained(
         asr_model=model_id,
         diarizer_model="pyannote/speaker-diarization",
+        use_auth_token=False,
         chunk_length_s=30,
-        device="cuda",
+        device=device,
     )
 
-    audio_path = download_and_convert_to_mp3(url)
+    audio_path = download_youtube_to_mp3(url)
     output_text = pipeline(
         audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
     dialogue = format_speech_to_dialogue(output_text)
     return dialogue, audio_path
 
 
-def youtube_url_to_text_app():
-    with gr.Blocks():
+def text2spech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
+    tts = TextToSpeechPipeline(model_id=model_id)
+    audio = tts(text=text, voice_preset=voice_preset)
+    return audio
+
+
+def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
+    video_path = download_youtube_to_mp4(url)
+
+    caption = WhisperAutoCaptionPipeline(model_id=model_id)
+    output = caption(video_path=video_path, output_path="output.mp4", language=language)
+    return output
+
+
+with gr.Blocks() as demo:
+    with gr.Tab("YouTube URL to Text"):
+        with gr.Row():
+            with gr.Column():
+                url_input = gr.Textbox(label="Enter YouTube URL")
+                model_id_input = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
+                language_input = gr.Textbox(label="Enter Language", value="en")
+                submit_btn1 = gr.Button("Submit")
+            with gr.Column():
+                output1 = gr.Textbox(label="Transcript")
+        submit_btn1.click(
+            youtube_url_to_text, inputs=[url_input, model_id_input, language_input], outputs=output1)
+
+    with gr.Tab("Text Summarization"):
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(label="Enter Text", lines=5)
+                model_id_input2 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
+                submit_btn2 = gr.Button("Summarize")
+            with gr.Column():
+                output2 = gr.Textbox(label="Summary")
+        submit_btn2.click(summarization, inputs=[text_input, model_id_input2], outputs=output2)
+
+    with gr.Tab("Long Text Summarization"):
         with gr.Row():
             with gr.Column():
-                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
-
-                language_choice = gr.Dropdown(
-                    choices=[
-                        "English",
-                        "Turkish",
-                        "Spanish",
-                        "French",
-                        "Chinese",
-                        "Japanese",
-                        "Korean",
-                    ],
-                    value="Turkish",
-                    label="Language",
-                )
-                whisper_model_id = gr.Dropdown(
-                    choices=[
-                        "openai/whisper-large-v3",
-                        "openai/whisper-large",
-                        "openai/whisper-medium",
-                        "openai/whisper-base",
-                        "openai/whisper-small",
-                        "openai/whisper-tiny",
-                    ],
-                    value="openai/whisper-large-v3",
-                    label="Whisper Model",
-                )
-                whisperplus_in_predict = gr.Button(value="Generator")
+                long_text_input = gr.Textbox(label="Enter Long Text", lines=10)
+                model_id_input3 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
+                submit_btn3 = gr.Button("Summarize Long Text")
+            with gr.Column():
+                output3 = gr.Textbox(label="Long Text Summary")
+        submit_btn3.click(long_text_summarization, inputs=[long_text_input, model_id_input3], outputs=output3)
 
+    with gr.Tab("Speaker Diarization"):
+        with gr.Row():
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                output_audio = gr.Audio(label="Output Audio")
-
-        whisperplus_in_predict.click(
-            fn=youtube_url_to_text,
-            inputs=[
-                youtube_url_path,
-                whisper_model_id,
-                language_choice,
-            ],
-            outputs=[output_text, output_audio],
-        )
-        gr.Examples(
-            examples=[
-                [
-                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
-                    "distil-whisper/distil-large-v3",
-                    "English",
-                ],
-            ],
-            fn=youtube_url_to_text,
-            inputs=[
-                youtube_url_path,
-                whisper_model_id,
-                language_choice,
-            ],
-            outputs=[output_text, output_audio],
-            cache_examples=True,
-        )
-
-
-def speaker_diarization_app():
-    with gr.Blocks():
+                url_input2 = gr.Textbox(label="Enter YouTube URL")
+                model_id_input4 = gr.Textbox(label="Enter Model ID")
+                num_speakers = gr.Number(label="Number of Speakers", value=2)
+                min_speakers = gr.Number(label="Min Speakers", value=1)
+                max_speakers = gr.Number(label="Max Speakers", value=4)
+                device = gr.Textbox(label="Device", value="cpu")
+                submit_btn4 = gr.Button("Diarize")
+            with gr.Column():
+                output4 = gr.DataFrame(headers=["Speaker", "Text"], datatype=["str", "str"])
+        submit_btn4.click(
+            speaker_diarization,
+            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
+            outputs=output4)
+
+    with gr.Tab("Text to Speech"):
         with gr.Row():
             with gr.Column():
-                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
-
-                whisper_model_id = gr.Dropdown(
-                    choices=[
-                        "openai/whisper-large-v3",
-                        "distil-whisper/distil-large-v3",
-                        "distil-whisper/distil-large-v2",
-                    ],
-                    value="distil-whisper/distil-large-v3",
-                    label="Whisper Model",
-                )
-                num_speakers = gr.Number(value=2, label="Number of Speakers")
-                min_speaker = gr.Number(value=1, label="Minimum Number of Speakers")
-                max_speaker = gr.Number(value=2, label="Maximum Number of Speakers")
-                whisperplus_in_predict = gr.Button(value="Generator")
+                text_input2 = gr.Textbox(label="Enter Text", lines=3)
+                model_id_input5 = gr.Textbox(label="Enter Model ID", value="suno/bark")
+                voice_preset = gr.Textbox(label="Voice Preset", value="v2/en_speaker_6")
+                submit_btn5 = gr.Button("Generate Audio")
+            with gr.Column():
+                output5 = gr.Audio(label="Generated Audio")
+        submit_btn5.click(
+            text2spech_bark, inputs=[text_input2, model_id_input5, voice_preset], outputs=output5)
 
+    with gr.Tab("Whisper Autocaption"):
+        with gr.Row():
+            with gr.Column():
+                url_input3 = gr.Textbox(label="Enter YouTube URL")
+                language = gr.Textbox(label="Language", value="en")
+                model_id_input6 = gr.Textbox(label="Enter Model ID", value="openai/whisper-large-v2")
+                submit_btn6 = gr.Button("Generate Captions")
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                output_audio = gr.Audio(label="Output Audio")
-
-        whisperplus_in_predict.click(
-            fn=speaker_diarization,
-            inputs=[
-                youtube_url_path,
-                whisper_model_id,
-                num_speakers,
-                min_speaker,
-                max_speaker,
-            ],
-            outputs=[output_text, output_audio],
-        )
-        gr.Examples(
-            examples=[
-                [
-                    "https://www.youtube.com/shorts/o8PgLUgte2k",
-                    "distil-whisper/distil-large-v3",
-                    2,
-                    1,
-                    2,
-                ],
-            ],
-            fn=speaker_diarization,
-            inputs=[
-                youtube_url_path,
-                whisper_model_id,
-                num_speakers,
-                min_speaker,
-                max_speaker,
-            ],
-            outputs=[output_text, output_audio],
-            cache_examples=False,
-        )
-
-
-gradio_app = gr.Blocks()
-with gradio_app:
-    gr.HTML(
-        """
-        <h1 style='text-align: center'>
-        WhisperPlus: Advancing Speech-to-Text Processing 🚀
-        </h1>
-        """)
-    gr.HTML(
-        """
-        <h3 style='text-align: center'>
-        Follow me for more!
-        <a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
-        </h3>
-        """)
-    with gr.Row():
-        with gr.Column():
-            with gr.Tab(label="Youtube URL to Text"):
-                youtube_url_to_text_app()
-            with gr.Tab(label="Speaker Diarization"):
-                speaker_diarization_app()
-
-gradio_app.launch(debug=True)
+                output6 = gr.Video(label="Captioned Video")
+        submit_btn6.click(
+            whisper_autocaption, inputs=[url_input3, language, model_id_input6], outputs=output6)
+
+demo.launch()
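
For reference, the core of the new transcription path can be exercised outside Gradio. The following is a minimal sketch using only the API surface visible in the diff; the YouTube URL comes from the removed gr.Examples block, and the model ID and language mirror the new UI defaults. flash_attention_2=True presumably still requires the flash-attn package that the old startup shim installed.

# Minimal sketch of the new quantized transcription path (API as shown in the diff).
# The URL, model ID, and language below are illustrative, not part of the commit.
from transformers import HqqConfig
from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3

# Fetch and convert the audio; output_dir/filename follow the values used in the diff.
audio_path = download_youtube_to_mp3(
    "https://www.youtube.com/watch?v=di3rHkEZuUw", output_dir="downloads", filename="test")

# 4-bit HQQ weight quantization, configured as in the commit (axis=0 is the default).
hqq_config = HqqConfig(nbits=4, group_size=64, quant_zero=False, quant_scale=False, axis=0)

pipeline = SpeechToTextPipeline(
    model_id="openai/whisper-medium",
    quant_config=hqq_config,
    flash_attention_2=True,  # assumes flash-attn is available in the environment
)
transcript = pipeline(
    audio_path=audio_path,
    chunk_length_s=30,
    stride_length_s=5,
    max_new_tokens=128,
    batch_size=100,
    language="en",
    return_timestamps=False,
)
print(transcript)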