Runtime error
Upload app.py
app.py
CHANGED
@@ -6,23 +6,26 @@ from diffusers import StableDiffusionPipeline
 
 
 MODEL_NAME = "whispy/whisper_italian"
-
-
-
-    model="it5/it5-efficient-small-el32-news-summarization",
-)
-
-pipe = pipeline(
+YOUR_TOKEN = "hf_..."
+# whisper model fine-tuned for italian
+speech_ppl = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     chunk_length_s=30,
-    device="cpu"
-)
-
-
-
-
-
+    device="cpu"
+)
+# model summarizing text
+summarizer_ppl = pipeline(
+    "summarization",
+    model="it5/it5-efficient-small-el32-news-summarization"
+)
+# model translating text from Italian to English
+translator_ppl = pipeline(
+    "translation",
+    model="Helsinki-NLP/opus-mt-it-en"
+)
+# model producing an image from text
+image_ppl = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=YOUR_TOKEN)
 
 def transcribe(microphone, file_upload):
     warn_output = ""
@@ -37,12 +40,15 @@ def transcribe(microphone, file_upload):
 
     file = microphone if microphone is not None else file_upload
 
-    text =
-
-    translate =
+    text = speech_ppl(file)["text"]
+    print("Text: ", text)
+    translate = translator_ppl(text)
+    print("Translate: ", translate)
     translate = translate[0]["translation_text"]
-
-    image =
+    print("Translate 2: ", translate)
+    image = image_ppl(translate).images[0]
+    print("Image: ", image)
+    image.save("text-to-image.png")
 
     return warn_output + text, translate, image
 
@@ -80,7 +86,9 @@ mf_transcribe = gr.Interface(
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Audio(source="upload", type="filepath", optional=True),
     ],
-    outputs=["
+    outputs=[gr.Textbox(label="Transcribed text"),
+             gr.Textbox(label="Summarized text"),
+             gr.Image(type="pil", label="Output image")],
     layout="horizontal",
    theme="huggingface",
    title="Whisper Demo: Transcribe Audio",
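
For readability, the pipeline setup and transcribe() introduced by this commit can be assembled into one self-contained sketch. It is not part of the diff: the os import, the HF_TOKEN environment lookup (used here instead of the hard-coded token in the commit), and the comments are assumptions added for illustration; the model names and call chain are taken from the diff as-is.

import os

from transformers import pipeline
from diffusers import StableDiffusionPipeline

MODEL_NAME = "whispy/whisper_italian"
# assumption: read the access token from the environment instead of hard-coding it
HF_TOKEN = os.environ.get("HF_TOKEN")

# whisper model fine-tuned for Italian speech recognition
speech_ppl = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device="cpu",
)

# Italian news summarization model (created in the commit but not called in transcribe())
summarizer_ppl = pipeline(
    "summarization",
    model="it5/it5-efficient-small-el32-news-summarization",
)

# Italian-to-English translation model
translator_ppl = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-it-en",
)

# text-to-image model; needs a valid Hugging Face token for the gated checkpoint
image_ppl = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=HF_TOKEN,
)


def transcribe(microphone, file_upload):
    warn_output = ""
    # prefer the microphone recording, fall back to the uploaded file
    file = microphone if microphone is not None else file_upload

    # 1. transcribe the Italian audio to text
    text = speech_ppl(file)["text"]
    # 2. translate the Italian transcript to English
    translate = translator_ppl(text)
    translate = translate[0]["translation_text"]
    # 3. generate an image from the English text
    image = image_ppl(translate).images[0]
    image.save("text-to-image.png")

    return warn_output + text, translate, image

Loading CompVis/stable-diffusion-v1-4 at import time and running it on CPU is heavy for a typical Space, which may be related to the "Runtime error" status shown above.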