Drop common voice and update rtfx

#17
by sanchit-gandhi - opened
Files changed (5) hide show
  1. README.md +1 -1
  2. app.py +5 -5
  3. constants.py +34 -17
  4. init.py +1 -2
  5. utils_display.py +1 -2
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🏆
4
  colorFrom: red
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 3.42.0
8
  app_file: app.py
9
  pinned: true
10
  tags:
 
4
  colorFrom: red
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.41.0
8
  app_file: app.py
9
  pinned: true
10
  tags:
app.py CHANGED
@@ -6,12 +6,12 @@ from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
6
  from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
7
  from datetime import datetime, timezone
8
 
9
- LAST_UPDATED = "Feb 10th 2024"
10
 
11
  column_names = {
12
  "MODEL": "Model",
13
  "Avg. WER": "Average WER ⬇️",
14
- "RTF": "RTF (1e-3) ⬇️",
15
  "AMI WER": "AMI",
16
  "Earnings22 WER": "Earnings22",
17
  "Gigaspeech WER": "Gigaspeech",
@@ -20,7 +20,7 @@ column_names = {
20
  "SPGISpeech WER": "SPGISpeech",
21
  "Tedlium WER": "Tedlium",
22
  "Voxpopuli WER": "Voxpopuli",
23
- "Common Voice WER": "Common Voice 9"}
24
 
25
  eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
26
 
@@ -111,7 +111,6 @@ with gr.Blocks() as demo:
111
  leaderboard_table = gr.components.Dataframe(
112
  value=original_df,
113
  datatype=TYPES,
114
- max_rows=None,
115
  elem_id="leaderboard-table",
116
  interactive=False,
117
  visible=True,
@@ -143,6 +142,7 @@ with gr.Blocks() as demo:
143
  value=CITATION_TEXT, lines=7,
144
  label="Copy the BibTeX snippet to cite this source",
145
  elem_id="citation-button",
146
- ).style(show_copy_button=True)
 
147
 
148
  demo.launch()
 
6
  from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
7
  from datetime import datetime, timezone
8
 
9
+ LAST_UPDATED = "Aug 12th 2024"
10
 
11
  column_names = {
12
  "MODEL": "Model",
13
  "Avg. WER": "Average WER ⬇️",
14
+ "Avg. RTFx": "RTFx ⬆️️",
15
  "AMI WER": "AMI",
16
  "Earnings22 WER": "Earnings22",
17
  "Gigaspeech WER": "Gigaspeech",
 
20
  "SPGISpeech WER": "SPGISpeech",
21
  "Tedlium WER": "Tedlium",
22
  "Voxpopuli WER": "Voxpopuli",
23
+ }
24
 
25
  eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
26
 
 
111
  leaderboard_table = gr.components.Dataframe(
112
  value=original_df,
113
  datatype=TYPES,
 
114
  elem_id="leaderboard-table",
115
  interactive=False,
116
  visible=True,
 
142
  value=CITATION_TEXT, lines=7,
143
  label="Copy the BibTeX snippet to cite this source",
144
  elem_id="citation-button",
145
+ show_copy_button=True,
146
+ )
147
 
148
  demo.launch()
constants.py CHANGED
@@ -15,7 +15,7 @@ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body>
15
 
16
  INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
17
  on the Hugging Face Hub. \
18
- \nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️) and [RTF](https://openvoice-tech.net/index.php/Real-time-factor) (⬇️) - lower the better. Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
19
  \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \
20
  \nThe leaderboard currently focuses on English speech recognition, and will be expanded to multilingual evaluation in later versions."
21
 
@@ -33,34 +33,52 @@ Here you will find details about the speech recognition metrics and datasets rep
33
 
34
  ## Metrics
35
 
36
- 🎯 Word Error Rate (WER) and Real-Time Factor (RTF) are popular metrics for evaluating the accuracy of speech recognition
37
- models by estimating how accurate the predictions from the models are and how fast they are returned. We explain them each
38
- below.
 
 
 
 
 
 
39
 
40
  ### Word Error Rate (WER)
41
 
42
  Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It calculates the percentage
43
  of words in the system's output that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
44
 
45
- ```
46
- Example: If the reference transcript is "I really love cats," and the ASR system outputs "I don't love dogs,".
47
- The WER would be `50%` because 2 out of 4 words are incorrect.
48
- ```
49
 
50
- For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints. You can find the evaluation code on our [Github repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
 
 
 
51
 
52
- ### Real Time Factor (RTF)
 
 
 
53
 
54
- Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes an
55
- model to process a given amount of speech. It's usually expressed as a multiple of real time. An RTF of 1 means it processes
56
- speech as fast as it's spoken, while an RTF of 2 means it takes twice as long. Thus, **a lower RTF value indicates lower latency**.
57
 
58
  ```
59
- Example: If it takes an ASR system 10 seconds to transcribe 10 seconds of speech, the RTF is 1.
60
- If it takes 20 seconds to transcribe the same 10 seconds of speech, the RTF is 2.
 
 
 
 
 
 
 
61
  ```
 
 
62
 
63
- For the benchmark, we report RTF averaged over a 10 minute audio sample that is chunked into 30 second segments, mimicking the [chunked long-form transcription strategy](https://huggingface.co/blog/asr-chunking) performed in Transformers. We measure RTF on an A100 80GB GPU (Driver Version: 535.54.03, CUDA Version: 12.2), performing 5 warm-up runs and 3 graded runs, over which the RTF is averaged to get the final result.
 
64
 
65
  ## How to reproduce our results
66
 
@@ -86,7 +104,6 @@ are ranked based on their average WER scores, from lowest to highest.
86
  | Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
87
  |-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
88
  | [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
89
- | [Common Voice 9](https://huggingface.co/datasets/mozilla-foundation/common_voice_9_0) | Wikipedia | Narrated | 1409 | 27 | 27 | Punctuated & Cased | CC0-1.0 |
90
  | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | European Parliament | Oratory | 523 | 5 | 5 | Punctuated | CC0 |
91
  | [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | TED talks | Oratory | 454 | 2 | 3 | Normalised | CC-BY-NC-ND 3.0 |
92
  | [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500 | 12 | 40 | Punctuated | apache-2.0 |
 
15
 
16
  INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
17
  on the Hugging Face Hub. \
18
+ \nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
19
  \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \
20
  \nThe leaderboard currently focuses on English speech recognition, and will be expanded to multilingual evaluation in later versions."
21
 
 
33
 
34
  ## Metrics
35
 
36
+ Models are evaluated jointly using the Word Error Rate (WER) and Inverse Real Time Factor (RTFx) metrics. The WER metric
37
+ is used to assess the accuracy of a system, and the RTFx the inference speed. Models are ranked in the leaderboard based
38
+ on their WER, lowest to highest.
39
+
40
+ Crucially, the WER and RTFx values are computed for the same inference run using a single script. The implication of this is two-fold:
41
+ 1. The WER and RTFx values are coupled: for a given WER, one can expect to achieve the corresponding RTFx. This allows the proposer to trade off lower WER for higher RTFx should they wish.
42
+ 2. The WER and RTFx values are averaged over all audios in the benchmark (in the order of thousands of audios).
43
+
44
+ For details on reproducing the benchmark numbers, refer to the [Open ASR GitHub repository](https://github.com/huggingface/open_asr_leaderboard#evaluate-a-model).
45
 
46
  ### Word Error Rate (WER)
47
 
48
  Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It calculates the percentage
49
  of words in the system's output that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
50
 
51
+ Take the following example:
 
 
 
52
 
53
+ | Reference: | the | cat | sat | on | the | mat |
54
+ |-------------|-----|-----|---------|-----|-----|-----|
55
+ | Prediction: | the | cat | **sit** | on | the |     |
56
+ | Label: | ✅ | ✅ | S | ✅ | ✅ | D |
57
 
58
+ Here, we have:
59
+ * 1 substitution ("sit" instead of "sat")
60
+ * 0 insertions
61
+ * 1 deletion ("mat" is missing)
62
 
63
+ This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our
64
+ reference (N), which for this example is 6:
 
65
 
66
  ```
67
+ WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
68
+ ```
69
+
70
+ Giving a WER of 0.33, or 33%. For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints, meaning punctuation and casing are removed from the references and predictions. You can find the evaluation code on our [Github repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
71
+
72
+ ### Inverse Real Time Factor (RTFx)
73
+
74
+ Inverse Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes a
75
+ model to process a given amount of speech. It is defined as:
76
  ```
77
+ RTFx = (number of seconds of audio inferred) / (compute time in seconds)
78
+ ```
79
 
80
+ Therefore, an RTFx of 1 means a system processes speech as fast as it's spoken, while an RTFx of 2 means it takes half the time.
81
+ Thus, **a higher RTFx value indicates lower latency**.
82
 
83
  ## How to reproduce our results
84
 
 
104
  | Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
105
  |-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
106
  | [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
 
107
  | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | European Parliament | Oratory | 523 | 5 | 5 | Punctuated | CC0 |
108
  | [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | TED talks | Oratory | 454 | 2 | 3 | Normalised | CC-BY-NC-ND 3.0 |
109
  | [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500 | 12 | 40 | Punctuated | apache-2.0 |
init.py CHANGED
@@ -14,7 +14,6 @@ hf_api = HfApi(
14
 
15
  def load_all_info_from_dataset_hub():
16
  eval_queue_repo = None
17
- results_csv_path = None
18
  requested_models = None
19
 
20
  passed = True
@@ -40,7 +39,7 @@ def load_all_info_from_dataset_hub():
40
  if csv_results is None:
41
  passed = False
42
  if not passed:
43
- print("No HuggingFace token provided. Skipping evaluation requests and results.")
44
 
45
  return eval_queue_repo, requested_models, csv_results
46
 
 
14
 
15
  def load_all_info_from_dataset_hub():
16
  eval_queue_repo = None
 
17
  requested_models = None
18
 
19
  passed = True
 
39
  if csv_results is None:
40
  passed = False
41
  if not passed:
42
+ raise ValueError("No Hugging Face token provided. Skipping evaluation requests and results.")
43
 
44
  return eval_queue_repo, requested_models, csv_results
45
 
utils_display.py CHANGED
@@ -14,7 +14,7 @@ def fields(raw_class):
14
  class AutoEvalColumn: # Auto evals column
15
  model = ColumnContent("Model", "markdown")
16
  avg_wer = ColumnContent("Average WER ⬇️", "number")
17
- rtf = ColumnContent("RTF (1e-3) ⬇️", "number")
18
  ami_wer = ColumnContent("AMI", "number")
19
  e22_wer = ColumnContent("Earnings22", "number")
20
  gs_wer = ColumnContent("Gigaspeech", "number")
@@ -23,7 +23,6 @@ class AutoEvalColumn: # Auto evals column
23
  ss_wer = ColumnContent("SPGISpeech", "number")
24
  tl_wer = ColumnContent("Tedlium", "number")
25
  vp_wer = ColumnContent("Voxpopuli", "number")
26
- cv_wer = ColumnContent("Common Voice", "number")
27
 
28
 
29
  def make_clickable_model(model_name):
 
14
  class AutoEvalColumn: # Auto evals column
15
  model = ColumnContent("Model", "markdown")
16
  avg_wer = ColumnContent("Average WER ⬇️", "number")
17
+ rtf = ColumnContent("RTFx ⬆️️", "number")
18
  ami_wer = ColumnContent("AMI", "number")
19
  e22_wer = ColumnContent("Earnings22", "number")
20
  gs_wer = ColumnContent("Gigaspeech", "number")
 
23
  ss_wer = ColumnContent("SPGISpeech", "number")
24
  tl_wer = ColumnContent("Tedlium", "number")
25
  vp_wer = ColumnContent("Voxpopuli", "number")
 
26
 
27
 
28
  def make_clickable_model(model_name):