File size: 17,030 Bytes
0822dde
09820cd
 
 
 
 
 
 
 
0822dde
a6ebd86
 
09820cd
0822dde
 
 
 
a6ebd86
0822dde
 
 
 
09820cd
a6ebd86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09820cd
0822dde
09820cd
0822dde
09820cd
 
0822dde
 
09820cd
0822dde
 
09820cd
 
 
 
0822dde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
982909b
 
0822dde
 
 
a6ebd86
0822dde
 
 
 
 
 
 
 
 
 
 
 
 
a6ebd86
09820cd
a6ebd86
 
09820cd
 
 
 
a6ebd86
09820cd
 
 
0822dde
a6ebd86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0822dde
a6ebd86
 
 
 
 
 
0822dde
a6ebd86
 
 
 
 
 
 
 
0822dde
a6ebd86
 
 
 
 
 
 
 
 
0822dde
a6ebd86
 
 
 
 
 
 
 
0822dde
a6ebd86
 
 
 
 
 
0822dde
a6ebd86
 
 
0822dde
09820cd
0822dde
a6ebd86
0822dde
 
 
 
 
a6ebd86
0822dde
 
 
 
2c4a37c
0822dde
 
 
09820cd
 
 
 
a95af80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09820cd
 
 
 
 
a95af80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0822dde
 
 
09820cd
 
a95af80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0822dde
 
09820cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
import os
import re
from io import StringIO

import pandas as pd
import plotly.graph_objs as go
import requests
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError

load_dotenv()

# Base URL of the leaderboard results API; configured via a .env file.
SERVER_URL = os.getenv("SERVER_URL")

@st.cache_data
def get_data():
    """Fetch the full leaderboard result list from the backend server.

    Returns:
        The decoded JSON payload — a list of per-model result dicts.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    # Timeout guards against the serverless backend's cold-start hangs;
    # raise_for_status surfaces HTTP errors instead of a confusing JSON
    # decode failure on an error page.
    response = requests.get(SERVER_URL, timeout=30)
    response.raise_for_status()
    return response.json()

@st.cache_data
def get_model_info(df):
    """Return a copy of *df* with a 'Likes' column of Hub like-counts.

    For each row, looks up ``row['Model']`` on the Hugging Face Hub and
    records the like count (rendered as e.g. ``"42🧑"``); models that do
    not exist on the Hub get ``None``.

    Args:
        df: DataFrame with a 'Model' column of Hub repo ids.

    Returns:
        A new DataFrame (same rows/columns plus 'Likes').
    """
    api = HfApi()

    # Work on a copy: st.cache_data hands back cached objects, and this
    # function is also called on slices of a cached frame — mutating the
    # argument in place would corrupt the cache (and trigger pandas
    # SettingWithCopy issues on slices).
    df = df.copy()
    df['Likes'] = None

    for index, row in df.iterrows():
        model = str(row['Model']).strip()
        try:
            model_info = api.model_info(repo_id=model)
            df.loc[index, 'Likes'] = f"{model_info.likes}🧑"
        except (RepositoryNotFoundError, RevisionNotFoundError):
            # Model not on the Hub (or bad revision): leave Likes empty.
            df.loc[index, 'Likes'] = None

    return df


# @st.cache_data
def main():
    """Render the Streamlit app: Leaderboard, About, FAQ and Submit tabs.

    Pulls evaluation results from the backend via ``get_data``, flattens
    them into a DataFrame, and lets the user filter by model name,
    language and benchmark, optionally grouped per language.
    """
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")

    # Header row: title/blurb on the left, a small refresh button on the right.
    title_column, refresh_column = st.columns([.92, 0.08])
    with title_column:
        # NOTE(review): "Ξ±" looks like mojibake for "α" (alpha) — confirm the
        # file's encoding; same applies to the tab-label emoji below.
        st.title("Indic LLM Leaderboard (Ξ±)")
        st.markdown("The Indic Eval Leaderboard utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework , incorporating SOTA translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting 7 Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.")
    with refresh_column:
        # NOTE(review): get_data is wrapped in st.cache_data, so this click
        # re-reads the cached payload rather than forcing a re-fetch —
        # consider calling get_data.clear() first if a true refresh is wanted.
        if st.button("Refresh", type="primary"):
            data = get_data()

    Leaderboard_tab, About_tab ,FAQ_tab, Submit_tab = st.tabs(["πŸ… Leaderboard", "πŸ“ About" , "❗FAQ","πŸš€ Submit"])

    with Leaderboard_tab:
        data = get_data()

        table_data = []
        all_models = []

        # Flatten the server payload into one row per model. Each item is
        # assumed to look like {"name": ..., "language": ...,
        # "result": {<benchmark>: {"acc_norm": float}}} — TODO confirm
        # against the server schema. Missing benchmarks become None.
        for item in data:
            model_name = item.get("name")
            language = item.get("language")
            try:
                ALL = item["result"]["all"]["acc_norm"]
            except KeyError:
                ALL = None
            try:
                ARC_Easy = item["result"]["ARC-Easy"]["acc_norm"]
            except KeyError:
                ARC_Easy = None
            try:
                ARC_Challenge = item["result"]["ARC-Challenge"]["acc_norm"]
            except KeyError:
                ARC_Challenge = None
            try:
                Hellaswag = item["result"]["Hellaswag"]["acc_norm"]
            except KeyError:
                Hellaswag = None
            try:
                Boolq = item["result"]["Boolq"]["acc_norm"]
            except KeyError:
                Boolq = None
            try:
                MMLU = item["result"]["MMLU"]["acc_norm"]
            except KeyError:
                MMLU = None
            try:
                Translation = item["result"]["Translation"]["acc_norm"]
            except KeyError:
                Translation = None
            try:
                Generation = item["result"]["Generation"]["acc_norm"]
            except KeyError:
                Generation = None

            # Collect the row; all_models is kept for the model-name list.
            all_models.append(model_name)
            # NOTE(review): "Avergae" is a typo kept for now — the filtered
            # views below drop it via columns.intersection and compute their
            # own 'Average' column, but the compare view at the bottom still
            # shows this misspelled column.
            table_data.append({
                "Model": model_name,
                "Language": language,
                "Avergae": ALL,
                "ARC-Easy": ARC_Easy,
                "ARC-Challenge": ARC_Challenge,
                "Hellaswag": Hellaswag,
                "Boolq": Boolq,
                "MMLU": MMLU,
                "Translation": Translation,
                "Generation": Generation
            })

        df = pd.DataFrame(table_data)

        # Free-text model search; ';' separates multiple exact-name queries.
        title = st.text_input('Model', placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...")

        # When on, render one sub-table per selected language instead of a
        # single combined table.
        on = st.checkbox('Sort by Language')

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmark',
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq','MMLU','Translation','Generation'],['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq','MMLU'])
        with col2:
            language_options = st.multiselect(
                'Pick Languages',
                ['kannada', 'hindi', 'tamil', 'telegu','gujarathi','marathi','malayalam'],['kannada', 'hindi', 'tamil', 'telegu','gujarathi','marathi','malayalam'])
        if on:
            # Grouped view: one subheader + table per selected language.
            for language in language_options:
                filtered_df = df[df['Language'] == language]
                # Skip languages with no rows.
                if not filtered_df.empty:
                    # Upper-case only the first letter (capitalize() would
                    # also lower-case the rest).
                    st.subheader(f"{language.capitalize()[0]}{language[1:]}")
                    filtered_df.reset_index(drop=True, inplace=True)
                    filtered_df = get_model_info(filtered_df)
                    if title:
                        # NOTE(review): these branches filter the FULL df, not
                        # the per-language slice, so a search query makes every
                        # language section show the same cross-language rows —
                        # likely should filter filtered_df instead.
                        if ';' in title:
                            model_names = [name.strip() for name in title.split(';')]
                            filtered_df = df[df['Model'].isin(model_names)]
                        else:
                            filtered_df = df[df['Model'].str.contains(title, case=False, na=False)]

                        filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                        # Calculate average across selected benchmark columns
                        filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                        filtered_df.index += 1
                        st.dataframe(filtered_df, use_container_width=True)
                    elif benchmark_options or language_options:
                        filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                        # Calculate average across selected benchmark columns
                        filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)

                        filtered_df = get_model_info(filtered_df)
                        filtered_df.index += 1
                        st.dataframe(filtered_df, use_container_width=True)
        else:
            # Combined view: a single table across all selected languages.
            if title:
                if ';' in title:
                    # Exact-name match for each ';'-separated query.
                    model_names = [name.strip() for name in title.split(';')]
                    filtered_df = df[df['Model'].isin(model_names)]
                else:
                    # Case-insensitive substring match for a single query.
                    filtered_df = df[df['Model'].str.contains(title, case=False, na=False)]

                filtered_df = filtered_df[filtered_df['Language'].isin(language_options)]
                filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                # Calculate average across selected benchmark columns
                filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                filtered_df.index += 1
                # Display the filtered DataFrame (1-based index for rank).
                st.dataframe(filtered_df, use_container_width=True)
            elif benchmark_options or language_options:
                filtered_df = df[df['Language'].isin(language_options)]
                filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]

                # Calculate average across selected benchmark columns
                filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)

                filtered_df = get_model_info(filtered_df)
                filtered_df.index += 1
                st.dataframe(filtered_df, use_container_width=True)



        # Multiselect for comparing models side by side.
        compare_models = st.multiselect(
            'Pick Models to compare them',
            df['Model'].unique()
        )
        # Display DataFrame for selected models and their scores
        if compare_models:
            compare_data = []
            for model in compare_models:
                model_data = df[df['Model'] == model]
                compare_data.append(model_data)
            if compare_data:
                compare_df = pd.concat(compare_data)
                compare_df['Average'] = compare_df[benchmark_options].mean(axis=1) # Calculate average
                compare_df.index += 1
                st.dataframe(compare_df, use_container_width=True)



    # About tab
    with About_tab:
        st.markdown('''
## **Why a Indic LLM Leaderboard is Required ?**

In recent months, there has been considerable progress in the Indic large language model (LLM) space. Major startups like Sarvam and Krutrim are building LLMs in this area.
Simultaneously, the open-source community is also adapting pretrained models, such as Llama, Mistral, and Gemma, for Indic languages.
Despite the influx of new models, there is a lack of a unified method to evaluate and compare them. This makes it challenging to track progress and determine what is working and what is not.

> This is the alpha release of the Indic LLM Leaderboard, and modifications will be made to the leaderboard in the future.
> 

## **Who We Are**

I'm [Adithya S K](https://linktr.ee/adithyaskolavi), the founder of [CognitiveLab](https://www.cognitivelab.in/). We provide AI solutions at scale and undertake research-based tasks.

One initiative we have taken is to create a unified platform where Indic LLMs can be compared using specially crafted datasets. Although initially developed for internal use, we are now open-sourcing this framework to further aid the Indic LLM ecosystem.

After releasing [Amabri, a 7b parameter English-Kannada bilingual LLM](https://www.cognitivelab.in/blog/introducing-ambari), we wanted to compare it with other open-source LLMs to identify areas for improvement. As there wasn't an existing solution, we built the Indic LLM suite, which consists of three projects:

- [Indic-llm](https://github.com/adithya-s-k/Indic-llm): An open-source framework designed to adapt pretrained LLMs, such as Llama, Mistral, and Mixtral, to a wide array of domains and languages.
- [Indic-Eval](https://github.com/adithya-s-k/indic_eval): A lightweight evaluation suite tailored specifically for assessing Indic LLMs across a diverse range of tasks, aiding in performance assessment and comparison within the Indian language context.
- [Indic LLM Leaderboard](https://huggingface.co./spaces/Cognitive-Lab/indic_llm_leaderboard): Utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework, incorporating state-of-the-art translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting seven Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.

## **Upcoming implementations**

- [ ] Support to add VLLM for faster evaluation and inference
- [ ] SkyPilot installation to quickly run indic_eval on any cloud provider
- [ ] Add support for onboard evaluation just like OpenLLM Leaderboard

**Contribute**

All the projects are completely open source with different licenses, so anyone can contribute.

The current leaderboard is in alpha release, and many more changes are forthcoming:

- More robust benchmarks tailored for Indic languages.
- Easier integration with [indic_eval](https://github.com/adithya-s-k/indic_eval).
        ''')

    # FAQ tab
    with FAQ_tab:
        st.markdown('''
**What is the minimum requirement for GPUs to run the evaluation?**

- The evaluation can easily run on a single A100 GPU, but the framework also supports multi-GPU based evaluation to speed up the process.

**What languages are supported by the evaluation framework?**

- The following languages are supported by default: English, Kannada, Hindi, Tamil, Telugu, Gujarati, Marathi, Malayalam.

**How can I put my model on the leaderboard?**

- Please follow the steps shown in the Submit tab or refer to the indic_eval for more details.

**How does the leaderboard work?**

- After running indic_eval on the model of your choice, the results are pushed to a server and stored in a database. The Frontend Leaderboard accesses the server and retrieves the latest models in the database along with their respective benchmarks and metadata. The entire system is deployed in India and is as secure as possible.

**How is it different from the Open LLM leaderboard?**

- This project was mainly inspired by the Open LLM leaderboard. However, due to limited computation resources, we standardized the evaluation library with standard benchmarks. You can run the evaluation on your GPUs and the leaderboard will serve as a unified platform to compare models. We used indictrans2 and other translation APIs to translate the benchmarking dataset into seven Indian languages to ensure reliability and consistency in the output.

**Why does it take so much time to load the results?**

- We are running the server on a serverless instance which has a cold start problem, so it might sometimes take a while.

**What benchmarks are offered?**

- The current Indic Benchmarks offered by the indic_eval library can be found in this collection: https://huggingface.co./collections/Cognitive-Lab/indic-llm-leaderboard-eval-suite-660ac4818695a785edee4e6f. They include ARC Easy, ARC Challenge, Hellaswag, Boolq, and MMLU.

**How much time does it take to run the evaluation using indic_eval?**

- Depending on which GPU you are running, the time for evaluation varies.
- From our testing, it takes 3 to 4 hours to run the whole evaluation on a single GPU.
- It's much faster when using multiple GPUs.

**How does the verification step happen?**

- While running the evaluation, you are given an option to push results to the leaderboard with `-push_to_leaderboard <[email protected]>`. You will need to provide an email address through which we can contact you. If we find any anomaly in the evaluation score, we will contact you through this email for verification of results.
        ''')

    # Submit tab
    with Submit_tab:
        st.markdown('''
Here are the steps you will have to follows to put your model on the Indic LLM leaderboard 

Clone the repo:

```bash
git clone <https://github.com/adithya-s-k/indic_eval>
cd indic_eval

```

Create a virtual environment using virtualenv or conda depending on your preferences. We require Python 3.10 or above:

```bash
conda create -n indic-eval-venv python=3.10 && conda activate indic-eval-venv

```

Install the dependencies. For the default installation, you just need:

```bash
pip install .

```

If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`):

```bash
pip install '.[optional1,optional2]'

```

The setup tested most is:

```bash
pip install '.[accelerate,quantization,adapters]'

```

If you want to push your results to the Hugging Face Hub, don't forget to add your access token to the environment variable `HUGGING_FACE_HUB_TOKEN`. You can do this by running:

```
huggingface-cli login
```

## Command to Run Indic Eval and Push to Indic LLM Leaderboard

```bash
accelerate launch run_indic_evals_accelerate.py \\
    --model_args="pretrained=<path to model on the hub>" \\
    --tasks indic_llm_leaderboard \\
    --output_dir output_dir \\
    --push_to_leaderboard <[email protected]> \\

```

It's as simple as that.πŸ‘

For `--push_to_leaderboard`, provide an email id through which we can contact you in case of verification. This email won't be shared anywhere. It's only required for future verification of the model's scores and for authenticity.

After you have installed all the required packages, run the following command:

For multi-GPU configuration, please refer to the docs of [Indic_Eval](https://github.com/adithya-s-k/indic_eval).
        ''')


    # Citation block users can copy into papers.
    with st.expander(label="πŸ“™ Citation"):
        code = '''
                    @misc{indic-llm-leaderboard,
            author = {Adithya S Kolavi},
            title = {Indic LLM Leaderboard},
            year = {2024},
            publisher = {Cognitivelab},
            howpublished = "url{https://huggingface.co./spaces/Cognitive-Lab/indic_llm_leaderboard}",
            }
        '''
        st.code(code, language='python')

# Script entry point.
if __name__ == "__main__":
    main()