import gradio as gr
import pandas as pd

banner_url = "/file/banner.png"
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 20vw; min-width: 300px; max-width: 600px;"> </div>'

INTRODUCTION_TEXT = """
πŸ“–**Open Universal Arabic ASR Leaderboard**πŸ“– benchmarks Arabic ASR models on a range of multi-dialect datasets.
\nIn addition to the WER%/CER% on each test set, we report the Average WER%/CER% and rank models by Average WER, from lowest to highest.
\nTo reproduce the benchmark numbers or to request a model that is not listed, please open an issue/PR in our GitHub repo😊.
\nFor more detailed analyses, such as models' robustness, speaker adaptation, model efficiency, and memory usage, please check our paper.
"""

CITATION_BUTTON_TEXT = """@misc{???,
    title        = {???},
    author       = {???},
    year         = ???,
    publisher    = {???},
    howpublished = "???"
}
"""

METRICS_TAB_TEXT = """
## Metrics
We report both the Word Error Rate (WER) and Character Error Rate (CER) metrics.
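\nFor illustration, both metrics can be computed with the open-source `jiwer` package (a minimal sketch, not our exact evaluation scripts):
```python
import jiwer

# illustrative reference transcript and model hypothesis
reference = "ΨΉΩ†Ψ― ΨΊΨ±ΩˆΨ¨ Ψ§Ω„Ψ΄Ω…Ψ³"
hypothesis = "ΨΉΩ† ΨΊΨ±ΩˆΨ¨ Ψ§Ω„Ψ΄Ω…Ψ³"

print(jiwer.wer(reference, hypothesis))  # word error rate
print(jiwer.cer(reference, hypothesis))  # character error rate
```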
## Reproduction
The Open Universal Arabic ASR Leaderboard is an ongoing benchmark project.
\nWe open-source the evaluation scripts in our GitHub repo.
\nIf you would like to see results for a new model, please open a discussion in our GitHub repo.

## Benchmark datasets
| Test Set                                                                                        | Num Dialects   | Test (h)    |
|-------------------------------------------------------------------------------------------------|----------------|-------------|
| [SADA](https://www.kaggle.com/datasets/sdaiancai/sada2022)                                      | 10             | 10.7        |
| [Common Voice 18.0](https://commonvoice.mozilla.org/en/datasets)                                | 25             | 12.6        |
| [MASC (Clean-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 7              | 10.5        |
| [MASC (Noisy-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 8              | 14.9        |
| [MGB-2](http://www.mgb-challenge.org/MGB-2.html)                                                | Unspecified    | 9.6         |

## In-depth Analysis
We also provide a comprehensive analysis of models' robustness, speaker adaptation, inference efficiency and memory consumption.
\nPlease check our paper to learn more.
"""


def styled_message(message):
    # Wrap a status message in green, centered HTML for display below the form.
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"

LAST_UPDATED = "Nov 6th 2024"


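# Benchmark results: per-model WER/CER (%) on each test set, plus the
# averages across all five test sets.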
results = {
    "Model": ["nvidia-conformer-large-arabic (lm)", "nvidia-conformer-large-arabic (greedy)", "openai/whisper-large-v3", "facebook/seamless-m4t-v2-large", "openai/whisper-large-v3-turbo", "openai/whisper-large-v2", "openai/whisper-large", "asafaya/hubert-large-arabic-transcribe/", "openai/whisper-medium", "facebook/mms-1b-all", "openai/whisper-small", "whitefox123/w2v-bert-2.0-arabic-4", "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "speechbrain/asr-wav2vec2-commonvoice-14-ar"],
    "Average WER⬇️": [25.71, 27.46, 29.87, 32.55, 33.30, 34.04, 36.65, 39.29, 39.60, 47.86, 48.62, 52.18, 54.63, 60.15],
    "Average CER": [10.02, 9.94, 13.65, 14.47, 15.68, 16.26, 17.44, 13.61, 19.10, 17.66, 16.79, 25.15, 21.46, 26.64],
    "SADA WER": [44.52, 47.26, 55.96, 62.52, 60.36, 57.46, 63.24, 67.82, 67.71, 77.48, 78.02, 87.34, 86.82, 88.54],
    "SADA CER": [23.76, 22.54, 34.62, 37.61, 37.67, 36.59, 40.16, 31.83, 43.83, 37.50, 33.17, 56.75, 44.20, 50.28],
    "Common Voice WER": [8.80, 10.60, 17.83, 21.70, 25.73, 21.77, 26.04, 8.01, 28.07, 26.52, 24.18, 41.79, 23.00, 29.17],
    "Common Voice CER": [2.77, 3.05, 5.74, 6.24, 10.89, 7.44, 9.61, 2.37, 10.38, 7.21, 6.79, 15.75, 6.64, 9.85],
    "MASC(clean-test) WER": [23.74, 24.12, 24.66, 25.04, 25.51, 27.25, 28.89, 32.94, 29.99, 38.82, 35.93, 37.82, 42.75, 49.10],
    "MASC(clean-test) CER": [5.63, 5.63, 7.24, 7.19, 7.55, 8.28, 9.05, 7.15, 8.98, 10.36, 9.01, 11.92, 11.87, 16.37],
    "MASC(noisy-test) WER": [34.29, 35.64, 34.63, 33.24, 37.16, 38.55, 40.79, 50.16, 42.91, 57.33, 56.36, 53.28, 64.27, 69.57],
    "MASC(noisy-test) CER": [11.07, 11.02, 12.89, 11.92, 13.93, 15.49, 16.31, 15.62, 17.49, 19.76, 19.43, 21.93, 24.17, 30.17],
    "MGB-2 WER": [17.20, 19.69, 16.26, 20.23, 17.75, 25.17, 24.28, 37.51, 29.32, 39.16, 48.64, 40.66, 56.29, 64.37],
    "MGB-2 CER": [6.87, 7.46, 7.74, 9.37, 8.34, 13.48, 12.10, 11.07, 14.82, 13.48, 15.56, 19.39, 20.44, 26.56],
}

original_df = pd.DataFrame(results)
# Rank models by Average WER, lowest (best) first.
original_df.sort_values(by="Average WER⬇️", inplace=True)

# Column datatypes for gr.Dataframe: the model name is a string, every metric column is numeric.
TYPES = ["str"] + ["number"] * (len(original_df.columns) - 1)


def request_model(model_text):
    # The submitted model name is not processed here; requests go through GitHub.
    return styled_message("πŸ€— Please open a discussion in our GitHub repo, thank you. πŸ€—")

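# Assemble the UI: banner, intro text, three tabs (leaderboard table, metrics
# description, model-request form), a last-updated note, and a citation box.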
with gr.Blocks() as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ… Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.Dataframe(
                value=original_df,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

        with gr.TabItem("πŸ“ˆ Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("βœ‰οΈβœ¨ Request a model here!", elem_id="od-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("# βœ‰οΈβœ¨ Request results for a new model here!", elem_classes="markdown-text")
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                mdw_submission_result = gr.Markdown()
                btn_submit = gr.Button(value="πŸš€ Request")
                btn_submit.click(request_model, [model_name_textbox], mdw_submission_result)

    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("πŸ“™ Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT, lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch(
    # allowed_paths lets Gradio serve banner.png; "/" permits any path on disk.
    allowed_paths=["/"],
    # share=True,
)