patrickvonplaten committed on
Commit
873b685
1 Parent(s): f272fb3

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -12
README.md CHANGED
@@ -28,20 +28,18 @@ The only change from the existing ASR pipeline will be:
28
 
29
  ```diff
30
  import torch
31
- import torchaudio.functional as F
32
- -from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
33
- +from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
34
  from datasets import load_dataset
 
 
35
 
36
- ds = load_dataset("common_voice", "es", split="test", streaming=True)
37
 
38
- sample = next(iter(ds))
39
 
40
- resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), 48_000, 16_000).numpy()
 
41
 
42
- model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
43
- -processor = Wav2Vec2Processor.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
44
- +processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
45
 
46
  input_values = processor(resampled_audio, return_tensors="pt").input_values
47
 
@@ -50,9 +48,8 @@ with torch.no_grad():
50
 
51
  -prediction_ids = torch.argmax(logits, dim=-1)
52
  -transcription = processor.batch_decode(prediction_ids)
53
- +transcription = processor.batch_decode(logits.cpu().numpy()).text
54
-
55
- print(transcription)
56
  ```
57
 
58
  **Improvement**
 
28
 
29
  ```diff
30
  import torch
 
 
 
31
  from datasets import load_dataset
32
+ from transformers import AutoModelForCTC, AutoProcessor
33
+ import torchaudio.functional as F
34
 
 
35
 
36
+ model_id = "patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm"
37
 
38
+ sample = next(iter(load_dataset("common_voice", "es", split="test", streaming=True)))
39
+ resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), 48_000, 16_000).numpy()
40
 
41
+ model = AutoModelForCTC.from_pretrained(model_id)
42
+ processor = AutoProcessor.from_pretrained(model_id)
 
43
 
44
  input_values = processor(resampled_audio, return_tensors="pt").input_values
45
 
 
48
 
49
  -prediction_ids = torch.argmax(logits, dim=-1)
50
  -transcription = processor.batch_decode(prediction_ids)
51
+ +transcription = processor.batch_decode(logits.numpy()).text
52
+ # => 'bien y qué regalo vas a abrir primero'
 
53
  ```
54
 
55
  **Improvement**