import os

os.system("pip install --upgrade transformers accelerate")
os.system("pip install tokenizers fairseq")
os.system("pip install numpy==1.24")  # NumPy 1.24 or lower is required by Numba
os.system("pip install torch transformers accelerate torchaudio datasets")
os.system("pip install librosa==0.9.0")
# os.system("pip install gradio==4.16.0")  # Roll back to pre-4.17.0 due to gr.Audio playback issues
os.system("pip install --upgrade gradio")

import scipy
import gradio as gr
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
from datasets import load_dataset, Audio, Dataset
import torch
import librosa  # For converting the audio sample rate to 16 kHz

LANG = "dtp"  # Kadazan Dusun; change to "tih" for Timugon Murut or "iba" for Iban

# Load the MMS ASR checkpoint and switch it to the target-language adapter
model_id = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
processor.tokenizer.set_target_lang(LANG)
model.load_adapter(LANG)

# Pipeline alternative: calling it returns a dict, with the transcription stored under the key "text"
asr_pipeline = pipeline(task="automatic-speech-recognition", model=model_id)

# MMS text-to-speech model and tokenizer
model_tts = VitsModel.from_pretrained("facebook/mms-tts-dtp")
tokenizer_tts = AutoTokenizer.from_pretrained("facebook/mms-tts-dtp")


def preprocess(audio_path):
    # Resample the recording to 16 kHz and return it as a numpy ndarray.
    # (Equivalently, a datasets Audio(sampling_rate=16000) cast could decode the file at 16 kHz.)
    speech, sample_rate = librosa.load(audio_path)
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    return speech


def run(audio_array):
    # Run the 16 kHz waveform through the ASR model and greedily decode the logits
    inputs = processor(audio_array, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs).logits
    ids = torch.argmax(outputs, dim=-1)[0]
    transcription = processor.decode(ids)
    return transcription


def transcribe(audio_path):
    # Gradio UI wrapper function
    audio_array = preprocess(audio_path)  # Call the preprocessor function
    out = run(audio_array)
    return out


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """