whispy committed on
Commit
9cd344f
•
1 Parent(s): 9ba61bd

Upload 3 files

Browse files
Files changed (2) hide show
  1. app.py +118 -51
  2. requirements.txt +4 -6
app.py CHANGED
@@ -1,60 +1,127 @@
1
- import gradio as gr
2
  import torch
3
- import whisper
 
 
 
4
  from diffusers import DiffusionPipeline
5
- from transformers import (
6
- WhisperForConditionalGeneration,
7
- WhisperProcessor,
8
- )
9
 
10
- import os
11
- MY_SECRET_TOKEN=os.environ.get('HF_TOKEN_SD')
12
 
13
- device = "cuda" if torch.cuda.is_available() else "cpu"
14
- model = WhisperForConditionalGeneration.from_pretrained("whispy/whisper_italian").to(device)
15
- processor = WhisperProcessor.from_pretrained("whispy/whisper_italian")
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  diffuser_pipeline = DiffusionPipeline.from_pretrained(
18
  "CompVis/stable-diffusion-v1-4",
19
- custom_pipeline="speech_to_image_diffusion",
20
- speech_model=model,
21
- speech_processor=processor,
22
- use_auth_token=MY_SECRET_TOKEN,
23
- revision="fp16",
24
- torch_dtype=torch.float16,
25
  )
26
 
27
- diffuser_pipeline.enable_attention_slicing()
28
- diffuser_pipeline = diffuser_pipeline.to(device)
29
-
30
-
31
- #————————————————————————————————————————————
32
- # GRADIO SETUP
33
- title = "Speech to Diffusion • Community Pipeline"
34
- description = """
35
- <p style='text-align: center;'>This demo can generate an image from an audio sample using pre-trained OpenAI whisper-small and Stable Diffusion.<br />
36
- Community examples consist of both inference and training examples that have been added by the community.<br />
37
- <a href='https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image' target='_blank'> Click here for more information about community pipelines </a>
38
- </p>
39
- """
40
- article = """
41
- <p style='text-align: center;'>Community pipeline by Mikail Duzenli • Gradio demo by Sylvain Filoni & Ahsen Khaliq<p>
42
- """
43
- audio_input = gr.Audio(source="microphone", type="filepath")
44
- image_output = gr.Image()
45
-
46
- def speech_to_text(audio_sample):
47
-
48
- process_audio = whisper.load_audio(audio_sample)
49
- output = diffuser_pipeline(process_audio)
50
-
51
- print(f"""
52
- ————————
53
- output: {output}
54
- ————————
55
- """)
56
-
57
- return output.images[0]
58
-
59
- demo = gr.Interface(fn=speech_to_text, inputs=audio_input, outputs=image_output, title=title, description=description, article=article)
60
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
+
3
+ import gradio as gr
4
+ import pytube as pt
5
+ from transformers import pipeline
6
  from diffusers import DiffusionPipeline
 
 
 
 
7
 
 
 
8
 
9
+ MODEL_NAME = "whispy/whisper_italian"
10
+
11
+ device = 0 if torch.cuda.is_available() else "cpu"
12
+
13
+ summarizer = pipeline(
14
+ "summarization",
15
+ model="it5/it5-efficient-small-el32-news-summarization",
16
+ )
17
+
18
+ pipe = pipeline(
19
+ task="automatic-speech-recognition",
20
+ model=MODEL_NAME,
21
+ chunk_length_s=30,
22
+ device=device,
23
+ )
24
 
25
  diffuser_pipeline = DiffusionPipeline.from_pretrained(
26
  "CompVis/stable-diffusion-v1-4",
27
+ #custom_pipeline="speech_to_image_diffusion",
28
+ #speech_model=model,
29
+ #speech_processor=processor,
30
+ #use_auth_token=MY_SECRET_TOKEN,
31
+ #revision="fp16",
32
+ #torch_dtype=torch.float16,
33
  )
34
 
35
+ #diffuser_pipeline.enable_attention_slicing()
36
+ #diffuser_pipeline = diffuser_pipeline.to(device)
37
+
38
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-it-en")
39
+
40
+ def transcribe(microphone, file_upload):
41
+ warn_output = ""
42
+ if (microphone is not None) and (file_upload is not None):
43
+ warn_output = (
44
+ "WARNING: You've uploaded an audio file and used the microphone. "
45
+ "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
46
+ )
47
+
48
+ elif (microphone is None) and (file_upload is None):
49
+ return "ERROR: You have to either use the microphone or upload an audio file"
50
+
51
+ file = microphone if microphone is not None else file_upload
52
+
53
+ text = pipe(file)["text"]
54
+
55
+ translate = translator(text)
56
+ translate = translate[0]["translation_text"]
57
+
58
+ output = diffuser_pipeline(translate)
59
+ image = output.images[0]
60
+
61
+ return warn_output + text, translate, image
62
+
63
+
64
+ def _return_yt_html_embed(yt_url):
65
+ video_id = yt_url.split("?v=")[-1]
66
+ HTML_str = (
67
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
68
+ " </center>"
69
+ )
70
+ return HTML_str
71
+
72
+
73
+ def yt_transcribe(yt_url):
74
+ yt = pt.YouTube(yt_url)
75
+ html_embed_str = _return_yt_html_embed(yt_url)
76
+ stream = yt.streams.filter(only_audio=True)[0]
77
+ stream.download(filename="audio.mp3")
78
+
79
+ text = pipe("audio.mp3")["text"]
80
+
81
+ summary = summarizer(text)
82
+ summary = summary[0]["summary_text"]
83
+
84
+ translate = translator(summary)
85
+ translate = translate[0]["translation_text"]
86
+
87
+ return html_embed_str, text, summary, translate
88
+
89
+ demo = gr.Blocks()
90
+
91
+ mf_transcribe = gr.Interface(
92
+ fn=transcribe,
93
+ inputs=[
94
+ gr.inputs.Audio(source="microphone", type="filepath", optional=True),
95
+ gr.inputs.Audio(source="upload", type="filepath", optional=True),
96
+ ],
97
+ outputs=["text", "text", "image"],
98
+ layout="horizontal",
99
+ theme="huggingface",
100
+ title="Whisper Demo: Transcribe Audio",
101
+ description=(
102
+ "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
103
+ f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
104
+ " of arbitrary length."
105
+ ),
106
+ allow_flagging="never",
107
+ )
108
+
109
+ yt_transcribe = gr.Interface(
110
+ fn=yt_transcribe,
111
+ inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
112
+ outputs=["html", "text", "text", "text"],
113
+ layout="horizontal",
114
+ theme="huggingface",
115
+ title="Whisper Demo: Transcribe YouTube",
116
+ description=(
117
+ "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
118
+ f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
119
+ " arbitrary length."
120
+ ),
121
+ allow_flagging="never",
122
+ )
123
+
124
+ with demo:
125
+ gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
126
+
127
+ demo.launch(enable_queue=True)
requirements.txt CHANGED
@@ -1,7 +1,5 @@
1
- --extra-index-url https://download.pytorch.org/whl/cu113
2
  torch
3
- scipy
4
- ftfy
5
- git+https://github.com/huggingface/transformers
6
- git+https://github.com/huggingface/diffusers
7
- git+https://github.com/openai/whisper.git
 
1
+ transformers
2
  torch
3
+ pytube
4
+ diffusers
5
+ sentencepiece