Spaces:

amirza
/

draw_me_a_sheep_heb

Runtime error

App Files Community

Amir Zait committed on Aug 19, 2022

Commit

e8b13db

•

1 Parent(s): d5d060c

fixes

Browse files

Files changed (3) hide show

app.py +7 -30
image_generator.py +3 -5
requirements.txt +0 -2

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ from transformers import pipeline
 import soundfile as sf
 import gradio as gr
-import librosa
 import torch
 import sox
 import os
@@ -18,32 +17,6 @@ asr_model = AutoModelForCTC.from_pretrained("imvladikon/wav2vec2-xls-r-300m-hebr
 he_en_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-he-en")
-def process_audio_file(file):
-    data, sr = librosa.load(file)
-    if sr != 16000:
-        data = librosa.resample(data, sr, 16000)
-    input_values = asr_processor(data, sampling_rate=16_000, return_tensors="pt").input_values #.to(device)
-    return input_values
-def transcribe(file_mic, file_upload):
-    warn_output = ""
-    if (file_mic is not None) and (file_upload is not None):
-       warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-       file = file_mic
-    elif (file_mic is None) and (file_upload is None):
-       return "ERROR: You have to either use the microphone or upload an audio file"
-    elif file_mic is not None:
-       file = file_mic
-    else:
-       file = file_upload
-    input_values = process_audio_file(file)
-    logits = asr_model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = asr_processor.decode(predicted_ids[0], skip_special_tokens=True)
-    return warn_output + transcription
 def convert(inputfile, outfile):
     sox_tfm = sox.Transformer()
     sox_tfm.set_output_format(
@@ -52,22 +25,26 @@ def convert(inputfile, outfile):
     sox_tfm.build(inputfile, outfile)
 def parse_transcription(wav_file):
     filename = wav_file.name.split('.')[0]
     convert(wav_file.name, filename + "16k.wav")
     speech, _ = sf.read(filename + "16k.wav")
-    print(speech.shape)
     input_values = asr_processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
     logits = asr_model(input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = asr_processor.decode(predicted_ids[0], skip_special_tokens=True)
-    translated = he_en_translator(transcription)[0]['translation_text']
     image = generate_image(translated)
     return image
 output = gr.outputs.Image(label='')
 input_mic = gr.inputs.Audio(source="microphone", type="file", optional=True)
-input_upload = gr.inputs.Audio(source="upload", type="file", optional=True)
 gr.Interface(parse_transcription, inputs=[input_mic],  outputs=output,
              analytics_enabled=False,

 import soundfile as sf
 import gradio as gr
 import torch
 import sox
 import os
 he_en_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-he-en")
 def convert(inputfile, outfile):
     sox_tfm = sox.Transformer()
     sox_tfm.set_output_format(
     sox_tfm.build(inputfile, outfile)
 def parse_transcription(wav_file):
+    # Get the wav file from the microphone
     filename = wav_file.name.split('.')[0]
     convert(wav_file.name, filename + "16k.wav")
     speech, _ = sf.read(filename + "16k.wav")
+    # transcribe to hebrew
     input_values = asr_processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
     logits = asr_model(input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = asr_processor.decode(predicted_ids[0], skip_special_tokens=True)
+    # translate to english
+    translated = he_en_translator(transcription)[0]['translation_text']
+    # generate image
     image = generate_image(translated)
     return image
 output = gr.outputs.Image(label='')
 input_mic = gr.inputs.Audio(source="microphone", type="file", optional=True)
 gr.Interface(parse_transcription, inputs=[input_mic],  outputs=output,
              analytics_enabled=False,

image_generator.py CHANGED Viewed

@@ -7,13 +7,11 @@ from dalle_mini import DalleBart, DalleBartProcessor
 from vqgan_jax.modeling_flax_vqgan import VQModel
 # Model references
-# dalle-mega
-DALLE_MODEL = "dalle-mini/dalle-mini/mega-1-fp16:latest"  # can be wandb artifact or 🤗 Hub or local folder or google bucket
 DALLE_COMMIT_ID = None
-# if the notebook crashes too often you can use dalle-mini instead by uncommenting below line
-# DALLE_MODEL = "dalle-mini/dalle-mini/mini-1:v0"
 # VQGAN model
 VQGAN_REPO = "dalle-mini/vqgan_imagenet_f16_16384"
 VQGAN_COMMIT_ID = "e93a26e7707683d349bf5d5c41c5b0ef69b677a9"

 from vqgan_jax.modeling_flax_vqgan import VQModel
 # Model references
+# dalle-mini, mega too large
+# DALLE_MODEL = "dalle-mini/dalle-mini/mega-1-fp16:latest"  # can be wandb artifact or 🤗 Hub or local folder or google bucket
+DALLE_MODEL = "dalle-mini/dalle-mini/mini-1:v0"
 DALLE_COMMIT_ID = None
 # VQGAN model
 VQGAN_REPO = "dalle-mini/vqgan_imagenet_f16_16384"
 VQGAN_COMMIT_ID = "e93a26e7707683d349bf5d5c41c5b0ef69b677a9"

requirements.txt CHANGED Viewed

@@ -1,10 +1,8 @@
 gradio
-librosa
 soundfile
 torch
 transformers
 sox
-sentencepiece
 dalle-mini
 Pillow
 numpy

 gradio
 soundfile
 torch
 transformers
 sox
 dalle-mini
 Pillow
 numpy