classla
/

wav2vec2-xls-r-parlaspeech-hr-lm

Automatic Speech Recognition

Inference Endpoints

Model card Files Files and versions Community

5roop committed on Apr 29, 2022

Commit

e049326

•

1 Parent(s): c669a62

Update README.md

Add versions, correct use example.

Files changed (1) hide show

README.md +12 -18

README.md CHANGED Viewed

@@ -35,40 +35,34 @@ Nikola Ljubešić, Danijel Koržinek, Peter Rupnik, Ivo-Pavao Jazbec. ParlaSpeec
 ## Usage in `transformers`
-So far untested approach that worked before:
 ```python
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import soundfile as sf
 import torch
 import os
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # load model and tokenizer
-processor = Wav2Vec2Processor.from_pretrained(
     "5roop/wav2vec2-xls-r-parlaspeech-hr-lm")
 model = Wav2Vec2ForCTC.from_pretrained("5roop/wav2vec2-xls-r-parlaspeech-hr-lm")
 # download the example wav files:
-os.system("wget https://huggingface.co/5roop/wav2vec2-xls-r-parlaspeech-hr-lm/raw/main/00020570a.flac.wav")
 # read the wav file
 speech, sample_rate = sf.read("00020570a.flac.wav")
-input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)
 # remove the raw wav file
 os.system("rm 00020570a.flac.wav")
-# retrieve logits
-logits = model.to(device)(input_values).logits
-# take argmax and decode
-predicted_ids = torch.argmax(logits, dim=-1)
-transcription = processor.decode(predicted_ids[0]).lower()
-# transcription: 'veliki broj poslovnih subjekata posluje sa minusom velik dio'
 ```

 ## Usage in `transformers`
+Tested with `transformers==4.18.0`, `torch==1.11.0`, and `SoundFile==0.10.3.post1`.
 ```python
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
 import soundfile as sf
 import torch
 import os
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # load model and tokenizer
+processor = Wav2Vec2ProcessorWithLM.from_pretrained(
     "5roop/wav2vec2-xls-r-parlaspeech-hr-lm")
 model = Wav2Vec2ForCTC.from_pretrained("5roop/wav2vec2-xls-r-parlaspeech-hr-lm")
 # download the example wav files:
+os.system("wget https://huggingface.co/classla/wav2vec2-large-slavic-parlaspeech-hr/raw/main/00020570a.flac.wav")
 # read the wav file
 speech, sample_rate = sf.read("00020570a.flac.wav")
+input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()
+inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+transcription = processor.batch_decode(logits.numpy()).text[0]
 # remove the raw wav file
 os.system("rm 00020570a.flac.wav")
+transcription
+# transcription: 'velik broj poslovnih subjekata posluje sa minusom velik dio'
 ```