File size: 686 Bytes
6e42c7f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# import
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
# Transcribe a single wav file with a Wav2Vec2 CTC model and decode the
# resulting logits with the processor's language-model-backed decoder.

# load the processor and the acoustic model
# NOTE(review): the processor and the model are loaded from different
# checkpoints; this presumes both use the same CTC vocabulary — confirm.
processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
# load the audio data (use your own wav file here!)
# A 16 kHz sampling rate is requested from librosa; `sr` carries that rate forward.
input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)
# tokenize
# Passing the sampling rate lets the feature extractor verify that the audio
# matches the rate it was configured for instead of silently assuming it.
input_values = processor(
    input_audio,
    sampling_rate=sr,
    return_tensors="pt",
    padding="longest",
).input_values
# retrieve logits
# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    logits = model(input_values).logits
# decode using n-gram
# batch_decode takes a NumPy array of logits; `.text` holds the decoded strings.
transcription = processor.batch_decode(logits.numpy()).text
# print the output
print(transcription)
|