# Speech-to-text example: transcribe a WAV file with a pretrained
# wav2vec 2.0 CTC model and print the resulting text.
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Load the tokenizer and the model from the same pretrained checkpoint.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Load the audio data (use your own wav file here!).
# sr=16000 asks librosa for a 16 kHz waveform.
# NOTE(review): presumably the rate this checkpoint expects -- confirm
# against the model card.
input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)

# Turn the raw waveform into a batched PyTorch tensor of model inputs.
input_values = tokenizer(input_audio, return_tensors="pt", padding="longest").input_values

# Retrieve the logits. This is inference only, so disable autograd to
# avoid building a gradient graph that is never used.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy decoding: take the highest-scoring token id along the last axis,
# then let the tokenizer turn the id sequences into text.
transcription = tokenizer.batch_decode(torch.argmax(logits, dim=-1))

# Print the output (batch_decode's result for the batch).
print(transcription)