---
language: WOLOF
datasets:
- AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF
tags:
- speech
- audio
- automatic-speech-recognition
license: apache-2.0
metrics:
- WER
---

## Evaluation on Common Voice WOLOF Test

```python
import pandas as pd
from datasets import load_dataset, load_metric, Dataset
from tqdm import tqdm
import torch
import soundfile as sf
import torchaudio
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2CTCTokenizer

model_name = "kingabzpro/wav2vec2-large-xlsr-53-wolof"
# Fall back to the CPU so the snippet also runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Build the evaluation table: one row per clip, with the path to its audio file.
val = pd.read_csv("../input/automatic-speech-recognition-in-wolof/Test.csv")
val["path"] = (
    "../input/automatic-speech-recognition-in-wolof/Noise Removed/tmp/WOLOF_ASR_dataset/noise_remove/"
    + val["ID"]
    + ".wav"
)
val.rename(columns={"transcription": "sentence"}, inplace=True)

common_voice_val = Dataset.from_pandas(val)


def speech_file_to_array_fn_test(batch):
    """Read the audio file at ``batch["path"]``.

    Stores the decoded samples under ``"speech"`` and the file's sampling
    rate under ``"sampling_rate"``, then returns the updated example.
    """
    # The author notes the clips are .wav files with a 16000 Hz sample rate.
    speech_array, sampling_rate = sf.read(batch["path"])
    batch["speech"] = speech_array
    batch["sampling_rate"] = sampling_rate
    return batch


def prepare_dataset_test(batch):
    """Turn a batch of raw speech arrays into padded model input values.

    Raises:
        ValueError: if the examples in the batch do not all share one
            sampling rate.
    """
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if len(set(batch["sampling_rate"])) != 1:
        raise ValueError(
            f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        )

    batch["input_values"] = processor(
        batch["speech"],
        padding=True,
        sampling_rate=batch["sampling_rate"][0],
    ).input_values
    return batch


# Remove columns
common_voice_val = common_voice_val.remove_columns(
    ["ID", "age", "down_votes", "gender", "up_votes"]
)
# Applying speech_file_to_array function
common_voice_val = common_voice_val.map(
    speech_file_to_array_fn_test,
    remove_columns=common_voice_val.column_names,
)
# Applying prepare_dataset_test function
common_voice_val = common_voice_val.map(
    prepare_dataset_test,
    remove_columns=common_voice_val.column_names,
    batch_size=8,
    num_proc=4,
    batched=True,
)

final_pred = []
# Testing model on Wolof Dataset.
# Inference only: disabling gradient tracking avoids building autograd state
# for every utterance and keeps memory usage down.
with torch.no_grad():
    for sample in tqdm(common_voice_val):
        input_dict = processor(sample["input_values"], return_tensors="pt", padding=True)
        logits = model(input_dict.input_values.to(device)).logits
        # Greedy decoding: most likely token per frame, first (only) item in the batch.
        pred_ids = torch.argmax(logits, dim=-1)[0]
        prediction = processor.decode(pred_ids)
        final_pred.append(prediction)
```

You can check my result on [Zindi](https://zindi.africa/competitions/ai4d-baamtu-datamation-automatic-speech-recognition-in-wolof/leaderboard), where I ranked 8th in AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF.

**Result**: 7.88 %