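"""Streamlit frontend for the Indic LLM Leaderboard (alpha).

Fetches evaluation results produced with indic_eval from a results server
(configured via the SERVER_URL environment variable) and renders a filterable
leaderboard plus release notes, About, FAQ, and submission instructions.
"""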
import os

import pandas as pd
import requests
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError

load_dotenv()

SERVER_URL = os.getenv("SERVER_URL")

# @st.cache_data
def get_data():
    """Fetch the latest evaluation results from the leaderboard server."""
    response = requests.get(SERVER_URL)
    response.raise_for_status()
    return response.json()
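
# The table-building code in main() assumes each entry returned by get_data()
# looks roughly like:
#   {"name": "<model id>", "language": "kannada",
#    "result": {"ARC-Easy": {"acc_norm": 0.41}, ...}}
# Benchmarks missing from "result" are rendered as None in the leaderboard.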

# @st.cache_data
def get_model_info(df):
    api = HfApi()

    # Initialize a new column for Hugging Face likes
    df['Likes'] = None

    # Iterate through DataFrame rows
    for index, row in df.iterrows():
        model = row['Model'].strip()
        try:
            model_info = api.model_info(repo_id=str(model))
            df.loc[index, 'Likes'] = f"{model_info.likes}🧑"
            # df.loc[index, 'Tags'] = ', '.join(model_info.tags)

        except (RepositoryNotFoundError, RevisionNotFoundError):
            df.loc[index, 'Likes'] = None
            # df.loc[index, 'Tags'] = ''

    return df


# @st.cache_data
def main():
    
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")
    
    title_column, refresh_column = st.columns([.92, 0.08])
    with title_column:
        st.title("Indic LLM Leaderboard (α)")
        st.markdown("The Indic LLM Leaderboard utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework, incorporating SOTA translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting 7 Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.")
    with refresh_column:
        if st.button("Refresh", type="primary"):
            data = get_data()
    
    Leaderboard_tab, Release_tab, About_tab, FAQ_tab, Submit_tab = st.tabs(["🏅 Leaderboard", "(α) Release", "📝 About", "❗ FAQ", "🚀 Submit"])
    
    with Leaderboard_tab:
        data = get_data()
        
        
        table_data = []
        all_models = []
        try:
            for item in data:
                model_name = item.get("name")
                language = item.get("language")
                # Per-benchmark normalized accuracy; missing entries become None.
                result = item.get("result", {})
                ALL = result.get("all", {}).get("acc_norm")
                ARC_Easy = result.get("ARC-Easy", {}).get("acc_norm")
                ARC_Challenge = result.get("ARC-Challenge", {}).get("acc_norm")
                Hellaswag = result.get("Hellaswag", {}).get("acc_norm")
                Boolq = result.get("Boolq", {}).get("acc_norm")
                MMLU = result.get("MMLU", {}).get("acc_norm")
                Translation = result.get("Translation", {}).get("acc_norm")

                all_models.append(model_name)
                table_data.append({
                    "Model": model_name,
                    "Language": language,
                    "Avergae": ALL,
                    "ARC-Easy": ARC_Easy,
                    "ARC-Challenge": ARC_Challenge,
                    "Hellaswag": Hellaswag,
                    "Boolq": Boolq,
                    "MMLU": MMLU,
                    "Translation": Translation,
                })

            df = pd.DataFrame(table_data)
        except Exception:
            # Fall back to an empty table with the expected columns if the
            # server response could not be parsed.
            columns = ["Model", "Language", "Average (all)", "ARC-Easy", "ARC-Challenge", "Hellaswag", "Boolq", "MMLU", "Translation"]
            table_data = [{col: None for col in columns}]
            df = pd.DataFrame(table_data)
        
        title = st.text_input('Model', placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...")
        
        on = st.checkbox('Group by Language')
        

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmark',
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq','MMLU','Translation'],['ARC-Easy', 'ARC-Challenge', 'Hellaswag'])
        with col2:
            language_options = st.multiselect(
                'Pick Languages',
                ['kannada', 'hindi', 'tamil', 'telegu','gujarati','marathi','malayalam',"english"],['kannada', 'hindi', 'tamil', 'telegu','gujarati','marathi','malayalam',"english"])
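        # When grouping by language is enabled, render one table per selected
        # language; otherwise render a single table filtered by the search box,
        # the selected languages, and the selected benchmarks.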
        if on:
            # Loop through each selected language
            for language in language_options:
                filtered_df = df[df['Language'] == language]
                        # Check if the filtered dataframe is not empty
                if not filtered_df.empty:
                    st.subheader(f"{language.capitalize()[0]}{language[1:]}")
                    filtered_df.reset_index(drop=True, inplace=True)
                    # Display filtered dataframe
                    filtered_df = get_model_info(filtered_df)
                    if title:
                        # Search within the current language group so each
                        # per-language table reflects the query.
                        if ';' in title:
                            model_names = [name.strip() for name in title.split(';')]
                            filtered_df = filtered_df[filtered_df['Model'].isin(model_names)]
                        else:
                            filtered_df = filtered_df[filtered_df['Model'].str.contains(title, case=False, na=False)]

                        filtered_df = filtered_df[filtered_df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                        # Calculate average across selected benchmark columns
                        filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                        filtered_df.index += 1
                        st.dataframe(filtered_df, use_container_width=True)
                    elif benchmark_options or language_options:
                        filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                        # Calculate average across selected benchmark columns
                        filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                        
                        filtered_df = get_model_info(filtered_df)
                        filtered_df.index += 1
                        st.dataframe(filtered_df, use_container_width=True)
            # st.write('Feature activated!')
        else:

            if title:
                if ';' in title:
                    model_names = [name.strip() for name in title.split(';')]
                    filtered_df = df[df['Model'].isin(model_names)]
                else:
                    filtered_df = df[df['Model'].str.contains(title, case=False, na=False)]
                
                filtered_df = filtered_df[filtered_df['Language'].isin(language_options)]
                filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                # Calculate average across selected benchmark columns
                filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                filtered_df.index += 1
                # Display the filtered DataFrame
                st.dataframe(filtered_df, use_container_width=True)
            elif benchmark_options or language_options:
                filtered_df = df[df['Language'].isin(language_options)]
                filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                # Calculate average across selected benchmark columns
                filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                
                filtered_df = get_model_info(filtered_df)
                filtered_df.index += 1
                st.dataframe(filtered_df, use_container_width=True)


        
        
        # Multiselect for comparing models
        compare_models = st.multiselect(
            'Pick Models to compare them',
            df['Model'].unique()
        )
        # Display DataFrame for selected models and their scores
        if compare_models:
            compare_data = []
            for model in compare_models:
                model_data = df[df['Model'] == model]
                compare_data.append(model_data)
            if compare_data:
                compare_df = pd.concat(compare_data)
                compare_df['Average'] = compare_df[benchmark_options].mean(axis=1) # Calculate average
                compare_df.index += 1
                st.dataframe(compare_df, use_container_width=True)
                
    with Release_tab:
        st.markdown(
    """
    **Date: April 5th, 2024**

We are announcing the alpha release of the **Indic LLM Leaderboard** and **Indic Eval**.

The Indic LLM Leaderboard is an evolving platform, aiming to streamline evaluations for Language Model (LLM) models tailored to Indic languages. While this **alpha release is far from perfect**, it signifies a crucial initial step towards establishing evaluation standards within the community.

### Features:

As of this release, the following base models have been evaluated using the datasets and benchmarks integrated into the platform:

- `meta-llama/Llama-2-7b-hf`
- `google/gemma-7b`

Tasks incorporated into the platform:

- `ARC-Easy:{language}`
- `ARC-Challenge:{language}`
- `Hellaswag:{language}`

For evaluation purposes, each task uses 5-shot prompting. Further experimentation will determine the optimal balance between evaluation time and accuracy.

### Datasets:

Datasets utilized for evaluation are accessible via the following link: [Indic LLM Leaderboard Eval Suite](https://huggingface.co./collections/Cognitive-Lab/indic-llm-leaderboard-eval-suite-660ac4818695a785edee4e6f)

### Rationale for Alpha Release:

The decision to label this release as alpha stems from the realization that extensive testing and experimentation are necessary. Key considerations include:

- Selection of appropriate metrics for evaluation
- Determination of the optimal few-shot learning parameters
- Establishment of the ideal number of evaluation samples within the dataset

### Collaborative Effort:

To foster collaboration and discussion surrounding evaluations, a [WhatsApp group](https://chat.whatsapp.com/CUb6eS50lX2JHX2D4j13d1) is being established.

You can also connect with us on the Hugging Face Discord in the [indic_llm channel](https://discord.com/channels/879548962464493619/1189605147068858408).

### Roadmap for Next Release:

Anticipate the following enhancements in the upcoming release:

- Enhanced testing and accountability mechanisms
- A refined version of the leaderboard
- Defined benchmarks and standardized datasets
- Bilingual evaluation support
- Expansion of supported models
- Implementation of more secure interaction mechanisms
- Addition of support for additional languages

### Benchmarks to be added/tested 

- [ ] Boolq
- [ ] MMLU
- [ ] Translation - [IN22-Gen](https://huggingface.co./datasets/ai4bharat/IN22-Gen), [Flores](https://huggingface.co./datasets/facebook/flores)
- [ ] Generation - [ai4bharat/IndicSentiment](https://huggingface.co./datasets/ai4bharat/IndicSentiment), etc..

### Upcoming Implementations

- [ ] Support for vLLM for faster evaluation and inference
- [ ] Add support for on-platform evaluation, similar to the Open LLM Leaderboard

### Conclusion:

The alpha release of the Indic LLM Leaderboard and Indic Eval marks an important milestone in the pursuit of standardized evaluations for Indic language models. We invite contributions and feedback from the community to further enhance and refine these tools.

For more information and updates, visit [Indic LLM Leaderboard](https://huggingface.co./spaces/Cognitive-Lab/indic_llm_leaderboard) and [Indic Eval](https://github.com/adithya-s-k/indic_eval).

Thank you for your interest and support.

    """
        )

    # About tab
    with About_tab:
        st.markdown('''
## **Why an Indic LLM Leaderboard is Required**

In recent months, there has been considerable progress in the Indic large language model (LLM) space. Major startups like Sarvam and Krutrim are building LLMs in this area.
Simultaneously, the open-source community is also adapting pretrained models, such as Llama, Mistral, and Gemma, for Indic languages.
Despite the influx of new models, there is a lack of a unified method to evaluate and compare them. This makes it challenging to track progress and determine what is working and what is not.

> This is the alpha release of the Indic LLM Leaderboard, and modifications will be made to the leaderboard in the future.
> 

## **Who We Are**

I'm [Adithya S K](https://linktr.ee/adithyaskolavi), the founder of [CognitiveLab](https://www.cognitivelab.in/). We provide AI solutions at scale and undertake research-based tasks.

One initiative we have taken is to create a unified platform where Indic LLMs can be compared using specially crafted datasets. Although initially developed for internal use, we are now open-sourcing this framework to further aid the Indic LLM ecosystem.

After releasing [Ambari, a 7B-parameter English-Kannada bilingual LLM](https://www.cognitivelab.in/blog/introducing-ambari), we wanted to compare it with other open-source LLMs to identify areas for improvement. As there wasn't an existing solution, we built the Indic LLM suite, which consists of three projects:

- [Indic-llm](https://github.com/adithya-s-k/Indic-llm): An open-source framework designed to adapt pretrained LLMs, such as Llama, Mistral, and Mixtral, to a wide array of domains and languages.
- [Indic-Eval](https://github.com/adithya-s-k/indic_eval): A lightweight evaluation suite tailored specifically for assessing Indic LLMs across a diverse range of tasks, aiding in performance assessment and comparison within the Indian language context.
- [Indic LLM Leaderboard](https://huggingface.co./spaces/Cognitive-Lab/indic_llm_leaderboard): Utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework, incorporating state-of-the-art translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting seven Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.

**Contribute**

All the projects are completely open source with different licenses, so anyone can contribute.

The current leaderboard is in alpha release, and many more changes are forthcoming:

- More robust benchmarks tailored for Indic languages.
- Easier integration with [indic_eval](https://github.com/adithya-s-k/indic_eval).
        ''')
        
    # FAQ tab
    with FAQ_tab:
        st.markdown('''
Boolq, MMLU, and Translation are still being tested.

**What is the minimum requirement for GPUs to run the evaluation?**

- The evaluation can easily run on a single A100 GPU, but the framework also supports multi-GPU based evaluation to speed up the process.

**What languages are supported by the evaluation framework?**

- The following languages are supported by default: English, Kannada, Hindi, Tamil, Telugu, Gujarati, Marathi, Malayalam.

**How can I put my model on the leaderboard?**

- Please follow the steps shown in the Submit tab, or refer to [indic_eval](https://github.com/adithya-s-k/indic_eval) for more details.

**How does the leaderboard work?**

- After running indic_eval on the model of your choice, the results are pushed to a server and stored in a database. The leaderboard frontend queries the server and retrieves the latest models along with their benchmark scores and metadata. The entire system is deployed in India and is designed to be as secure as possible.
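
A minimal sketch of what the frontend does on refresh (illustrative only; the endpoint path is a hypothetical placeholder, and the field names mirror what this app's code expects):

```python
import requests

# Hypothetical endpoint; the real server URL is private configuration.
results = requests.get("https://<leaderboard-server>/results").json()

# Each entry is expected to carry the model name, language, and per-benchmark
# normalized accuracy, from which the leaderboard table is built.
for entry in results:
    acc = entry.get("result", {}).get("ARC-Easy", {}).get("acc_norm")
    print(entry["name"], entry["language"], acc)
```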

**How is it different from the Open LLM leaderboard?**

- This project was mainly inspired by the Open LLM leaderboard. However, due to limited computation resources, we standardized the evaluation library with standard benchmarks. You can run the evaluation on your GPUs and the leaderboard will serve as a unified platform to compare models. We used indictrans2 and other translation APIs to translate the benchmarking dataset into seven Indian languages to ensure reliability and consistency in the output.

**Why does it take so much time to load the results?**

- We are running the server on a serverless instance which has a cold start problem, so it might sometimes take a while.

**What benchmarks are offered?**

- The current Indic Benchmarks offered by the indic_eval library can be found in this collection: https://huggingface.co./collections/Cognitive-Lab/indic-llm-leaderboard-eval-suite-660ac4818695a785edee4e6f. They include ARC Easy, ARC Challenge, Hellaswag, Boolq, and MMLU.

**How much time does it take to run the evaluation using indic_eval?**

- Depending on which GPU you are running, the time for evaluation varies.
- From our testing, it takes 7 to 8 hours to run the whole evaluation on a single A100 GPU.
- It's much faster when using multiple GPUs.

**How does the verification step happen?**

- While running the evaluation, you are given the option to push results to the leaderboard with `--push_to_leaderboard <[email protected]>`. You will need to provide an email address through which we can contact you. If we find any anomaly in the evaluation scores, we will contact you through this email to verify the results.
        ''')

    # Submit tab
    with Submit_tab:
        st.markdown('''
Here are the steps you need to follow to put your model on the Indic LLM Leaderboard.

Clone the repo:

```bash
git clone https://github.com/adithya-s-k/indic_eval
cd indic_eval

```

Create a virtual environment using virtualenv or conda depending on your preferences. We require Python 3.10 or above:

```bash
conda create -n indic-eval-venv python=3.10 && conda activate indic-eval-venv

```

Install the dependencies. For the default installation, you just need:

```bash
pip install .

```

If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`):

```bash
pip install '.[optional1,optional2]'

```

The most thoroughly tested setup is:

```bash
pip install '.[accelerate,quantization,adapters]'

```

If you want to push your results to the Hugging Face Hub, don't forget to add your access token to the environment variable `HUGGING_FACE_HUB_TOKEN`. You can do this by running:

```bash
huggingface-cli login
```

## Command to Run Indic Eval and Push to the Indic LLM Leaderboard

After you have installed all the required packages, run the following command:

```bash
accelerate launch run_indic_evals_accelerate.py \\
    --model_args="pretrained=<path to model on the hub>" \\
    --tasks indic_llm_leaderboard \\
    --output_dir output_dir \\
    --push_to_leaderboard <[email protected]>

```

It's as simple as that. 👍

For `--push_to_leaderboard`, provide an email address through which we can contact you if verification is needed. This email won't be shared anywhere; it is only used to verify the model's scores and their authenticity.

For multi-GPU configuration, please refer to the docs of [Indic_Eval](https://github.com/adithya-s-k/indic_eval).
        ''')

        
    with st.expander(label="📙 Citation"):
        code = '''
        @misc{indic-llm-leaderboard,
            author = {Adithya S Kolavi},
            title = {Indic LLM Leaderboard},
            year = {2024},
            publisher = {CognitiveLab},
            howpublished = {\\url{https://huggingface.co./spaces/Cognitive-Lab/indic_llm_leaderboard}},
        }
        '''
        st.code(code, language='python')
        
if __name__ == "__main__":
    main()