example_asr_wav2vec2 / demo.nolm.py
tz579's picture
Training in progress, step 12776
6e42c7f verified
raw
history blame contribute delete
679 Bytes
# import
import librosa, torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
# Load the tokenizer and the CTC acoustic model from the same pretrained
# checkpoint so the vocabulary used for decoding matches the model's output.
# NOTE(review): newer transformers releases appear to deprecate
# Wav2Vec2Tokenizer in favour of Wav2Vec2Processor -- confirm against the
# transformers version this demo is pinned to before migrating.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Load the audio data (use your own wav file here!). librosa is asked for a
# 16 kHz sampling rate; `sr` holds the rate that was actually returned.
input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)

# Tokenize: turn the raw waveform into a PyTorch tensor of model inputs,
# padded to the longest sequence in the (single-item) batch.
input_values = tokenizer(input_audio, return_tensors="pt", padding="longest").input_values

# Retrieve logits. This script only performs inference, so autograd tracking is
# switched off: no gradient graph is built or retained for the forward pass.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy decoding: pick the highest-scoring vocabulary id along the last
# dimension of the logits, then let the tokenizer turn the ids into text.
transcription = tokenizer.batch_decode(torch.argmax(logits, dim=-1))

# Print the output (batch_decode returns one entry per batch item).
print(transcription)