from transformers import pipeline
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperFeatureExtractor
import gradio as gr
import librosa

# Prepare model for prediction
# Identifier of the fine-tuned checkpoint; the feature extractor and the model
# weights below are both loaded from it.
MODEL_SPECS_ID = "dmatekenya/whisper-small_finetuned_sh_chich"
# Identifier of the base checkpoint; the processor below is loaded from it.
MODEL_SPECS_BASE_ID = "openai/whisper-small"
# Language names for the processor. Only the Shona value is used in the
# visible code; the Swahili value is not referenced anywhere in view.
MODEL_SPECS_BASE_LAN_SW = "swahili"
MODEL_SPECS_BASE_LAN_SH = "shona"
# NOTE(review): FEATURE_EXTRACTOR is not referenced by the visible code
# (transcribe() goes through PROCESSOR_SH instead) - confirm whether it is
# still needed.
FEATURE_EXTRACTOR = WhisperFeatureExtractor.from_pretrained(MODEL_SPECS_ID)
# Processor built from the base checkpoint with language "shona" and task
# "transcribe"; transcribe() uses it for both feature extraction and decoding.
# NOTE(review): presumably Shona stands in for Chichewa because the base
# model has no Chichewa language option - confirm.
PROCESSOR_SH = WhisperProcessor.from_pretrained(MODEL_SPECS_BASE_ID, 
                                                language=MODEL_SPECS_BASE_LAN_SH, task="transcribe")
# Fine-tuned model whose generate() is called in transcribe().
MODEL = WhisperForConditionalGeneration.from_pretrained(MODEL_SPECS_ID)


def transcribe(audio_file):
    """Return the model's transcription of a single audio file.

    The file is loaded with librosa at a 16 kHz target sample rate, converted
    to input features by ``PROCESSOR_SH``, passed to ``MODEL.generate`` and the
    generated token ids are decoded back to text with special tokens removed.

    Args:
        audio_file: Audio source accepted by ``librosa.load``; the Gradio
            inputs in this file are configured to pass a file path.

    Returns:
        The first (and, for a single input, only) decoded string.
    """
    waveform, sample_rate = librosa.load(audio_file, sr=16000)

    encoded = PROCESSOR_SH(waveform, return_tensors="pt", sampling_rate=sample_rate)
    token_ids = MODEL.generate(inputs=encoded.input_features)

    decoded = PROCESSOR_SH.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]


def transcribe_audio(mic=None, file=None):
    """Transcribe whichever audio input was supplied.

    Args:
        mic: Microphone recording, or ``None`` if nothing was recorded.
        file: Uploaded audio file, or ``None`` if nothing was uploaded.

    Returns:
        The transcription produced by ``transcribe``. When both inputs are
        given the microphone recording is used. When neither is given, a
        message asking for an input is returned instead.
    """
    # Guard clause: nothing to transcribe.
    if mic is None and file is None:
        return "You must either provide a mic recording or a file"

    # The microphone recording takes precedence over the uploaded file.
    chosen = mic if mic is not None else file
    return transcribe(audio_file=chosen)

# Heading passed to the interface as its title.
title = "Transcribe Chichewa Audio-Whisper"

# Intro block passed to the interface as its description. Built line by line
# so the leading/trailing newlines and the trailing space after the first
# sentence are explicit rather than hidden in a triple-quoted literal.
description = "\n".join(
    [
        "",
        '<img src="https://i.ibb.co/5nQdGSs/logo.png">',
        "IN THIS DEMO, TEST THE FIRST AUTOMATED SPEECH RECOGNITION (ASR) MODEL FOR CHICHEWA BY TRANSCRIBING YOUR CHICHEWA VOICE NOTES. ",
        "FOR AUDIO FILES, PLEASE UPLOAD SHORT VOICE NOTES ONLY (NO LONGER THAN 30 SEC).",
        "",
    ]
)

# Footer text passed to the interface as its article: a single line with no
# embedded newlines, assembled from adjacent string literals.
article = (
    "Read more about the [ChichewaSpeech2Text](https://dmatekenya.github.io/Chichewa-Speech2Text/README.html) project "
    "and make sure to sign-up for our first [voice note donation event](https://forms.gle/fHLESutofVvb2YFM9) on July 22. "
    "You stand a chance to win Airtel or TNM units if you choose to participate in the raffle after the event"
)

# Two audio inputs, both configured with type="filepath": the first is sourced
# from the microphone, the second from an upload. Their order matches
# transcribe_audio's (mic, file) positional parameters.
# NOTE(review): the `source=` / `optional=` keywords and the 'grass' theme
# name depend on the installed Gradio release - confirm against the pinned
# version before upgrading.
_audio_inputs = [
    gr.Audio(source=origin, type="filepath", optional=True)
    for origin in ("microphone", "upload")
]

# Assemble the demo UI around transcribe_audio, then start serving it.
_demo = gr.Interface(
    fn=transcribe_audio,
    inputs=_audio_inputs,
    outputs="text",
    title=title,
    description=description,
    article=article,
    theme="grass",
)
_demo.launch()