# src/about.py

from enum import Enum
from dataclasses import dataclass

# Define TaskInfo dataclass
@dataclass
class TaskInfo:
    benchmark: str
    col_name: str
    metric: str

# Define the Tasks enum with the evaluated MMMLU subjects (subjects not evaluated are simply omitted)
class Tasks(Enum):
    Professional_Law = TaskInfo(benchmark='professional_law', col_name='Professional Law', metric='accuracy')
    Moral_Scenarios = TaskInfo(benchmark='moral_scenarios', col_name='Moral Scenarios', metric='accuracy')
    Miscellaneous = TaskInfo(benchmark='miscellaneous', col_name='Miscellaneous', metric='accuracy')
    High_School_Psychology = TaskInfo(benchmark='high_school_psychology', col_name='High School Psychology', metric='accuracy')
    High_School_Macroeconomics = TaskInfo(benchmark='high_school_macroeconomics', col_name='High School Macroeconomics', metric='accuracy')
    Elementary_Mathematics = TaskInfo(benchmark='elementary_mathematics', col_name='Elementary Mathematics', metric='accuracy')
    Prehistory = TaskInfo(benchmark='prehistory', col_name='Prehistory', metric='accuracy')
    Philosophy = TaskInfo(benchmark='philosophy', col_name='Philosophy', metric='accuracy')
    High_School_Biology = TaskInfo(benchmark='high_school_biology', col_name='High School Biology', metric='accuracy')
    Nutrition = TaskInfo(benchmark='nutrition', col_name='Nutrition', metric='accuracy')
    Professional_Accounting = TaskInfo(benchmark='professional_accounting', col_name='Professional Accounting', metric='accuracy')
    Professional_Medicine = TaskInfo(benchmark='professional_medicine', col_name='Professional Medicine', metric='accuracy')
    High_School_Mathematics = TaskInfo(benchmark='high_school_mathematics', col_name='High School Mathematics', metric='accuracy')
    Clinical_Knowledge = TaskInfo(benchmark='clinical_knowledge', col_name='Clinical Knowledge', metric='accuracy')
    Security_Studies = TaskInfo(benchmark='security_studies', col_name='Security Studies', metric='accuracy')
    High_School_Microeconomics = TaskInfo(benchmark='high_school_microeconomics', col_name='High School Microeconomics', metric='accuracy')
    High_School_World_History = TaskInfo(benchmark='high_school_world_history', col_name='High School World History', metric='accuracy')
    Conceptual_Physics = TaskInfo(benchmark='conceptual_physics', col_name='Conceptual Physics', metric='accuracy')
    Marketing = TaskInfo(benchmark='marketing', col_name='Marketing', metric='accuracy')
    High_School_Statistics = TaskInfo(benchmark='high_school_statistics', col_name='High School Statistics', metric='accuracy')
    High_School_US_History = TaskInfo(benchmark='high_school_us_history', col_name='High School US History', metric='accuracy')
    High_School_Chemistry = TaskInfo(benchmark='high_school_chemistry', col_name='High School Chemistry', metric='accuracy')
    Sociology = TaskInfo(benchmark='sociology', col_name='Sociology', metric='accuracy')
    High_School_Geography = TaskInfo(benchmark='high_school_geography', col_name='High School Geography', metric='accuracy')
    High_School_Government_and_Politics = TaskInfo(benchmark='high_school_government_and_politics', col_name='High School Government and Politics', metric='accuracy')
    College_Medicine = TaskInfo(benchmark='college_medicine', col_name='College Medicine', metric='accuracy')
    Virology = TaskInfo(benchmark='virology', col_name='Virology', metric='accuracy')
    High_School_European_History = TaskInfo(benchmark='high_school_european_history', col_name='High School European History', metric='accuracy')
    Logical_Fallacies = TaskInfo(benchmark='logical_fallacies', col_name='Logical Fallacies', metric='accuracy')
    Astronomy = TaskInfo(benchmark='astronomy', col_name='Astronomy', metric='accuracy')
    High_School_Physics = TaskInfo(benchmark='high_school_physics', col_name='High School Physics', metric='accuracy')
    Electrical_Engineering = TaskInfo(benchmark='electrical_engineering', col_name='Electrical Engineering', metric='accuracy')
    College_Biology = TaskInfo(benchmark='college_biology', col_name='College Biology', metric='accuracy')
    Anatomy = TaskInfo(benchmark='anatomy', col_name='Anatomy', metric='accuracy')
    Formal_Logic = TaskInfo(benchmark='formal_logic', col_name='Formal Logic', metric='accuracy')
    International_Law = TaskInfo(benchmark='international_law', col_name='International Law', metric='accuracy')
    Econometrics = TaskInfo(benchmark='econometrics', col_name='Econometrics', metric='accuracy')
    Machine_Learning = TaskInfo(benchmark='machine_learning', col_name='Machine Learning', metric='accuracy')
    Management = TaskInfo(benchmark='management', col_name='Management', metric='accuracy')
    College_Physics = TaskInfo(benchmark='college_physics', col_name='College Physics', metric='accuracy')
    US_Foreign_Policy = TaskInfo(benchmark='us_foreign_policy', col_name='US Foreign Policy', metric='accuracy')
    Business_Ethics = TaskInfo(benchmark='business_ethics', col_name='Business Ethics', metric='accuracy')
    College_Mathematics = TaskInfo(benchmark='college_mathematics', col_name='College Mathematics', metric='accuracy')
    College_Chemistry = TaskInfo(benchmark='college_chemistry', col_name='College Chemistry', metric='accuracy')
    College_Computer_Science = TaskInfo(benchmark='college_computer_science', col_name='College Computer Science', metric='accuracy')
    High_School_Computer_Science = TaskInfo(benchmark='high_school_computer_science', col_name='High School Computer Science', metric='accuracy')
    Computer_Security = TaskInfo(benchmark='computer_security', col_name='Computer Security', metric='accuracy')
    Global_Facts = TaskInfo(benchmark='global_facts', col_name='Global Facts', metric='accuracy')
    Medical_Genetics = TaskInfo(benchmark='medical_genetics', col_name='Medical Genetics', metric='accuracy')
    Abstract_Algebra = TaskInfo(benchmark='abstract_algebra', col_name='Abstract Algebra', metric='accuracy')
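
# Illustrative sketch (an assumption, not shown elsewhere in this file): app.py is
# expected to derive one leaderboard column per enum member. The helper below is
# hypothetical and exists only to show how the Tasks enum can be consumed.
def _benchmark_columns() -> list[str]:
    """Hypothetical helper: display column names derived from the Tasks enum."""
    return [task.value.col_name for task in Tasks]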


# Variables expected by app.py

TITLE = """
<div align="center">
    <a href="https://imgbb.com/">
        <img src="https://i.ibb.co/k1gQsTw/Blue-and-White-Modern-Technology-Company-Logo-2.png" alt="Blue-and-White-Modern-Technology-Company-Logo-2" border="0" width="500" height="auto">
    </a>
</div>
"""


INTRODUCTION_TEXT = """
<div style="background-color:#001f3f; padding: 20px; border-radius: 10px;">
    <h1 style="color:#ffffff; font-family: Arial, sans-serif; text-align: center;">
        Welcome to <span style="color:#f39c12;">ILMAAM</span>: Index for Language Models for Arabic Assessment on Multitasks!
    </h1>
    <p style="color:#d4d4d4; font-family: 'Verdana', sans-serif; font-size: 18px; text-align: center;">
        This leaderboard showcases the performance of various Arabic LLMs on the
        <strong style="color:#d4d4d4;">newly released OpenAI MMMLU benchmark</strong> across different subjects.
    </p>
</div>
"""


LLM_BENCHMARKS_TEXT = """
## About ILMAAM

ILMAAM is built on the Multilingual Massive Multitask Language Understanding (MMMLU) benchmark and uses it to evaluate Arabic models across a wide range of subjects.

## How to Interpret the Leaderboard

- **Model**: The name of the model evaluated.
- **Average ⬆️**: The average accuracy across all subjects.
- **Subject Columns**: The accuracy (%) for each individual subject.

## How to Submit Your Model

Go to the **Submit here!** tab and provide your model details to have it evaluated and appear on the leaderboard.
"""

EVALUATION_QUEUE_TEXT = """
Below are the lists of models that have been evaluated, are currently being evaluated, or are pending evaluation.
"""

CITATION_BUTTON_LABEL = "Citation"
CITATION_BUTTON_TEXT = """
If you use this leaderboard or the MMMLU dataset in your research, please cite:
@misc{ILMAAM,
  author = {Nacar, Omer},
  title = {ILMAAM: Index for Language Models For Arabic Assessment on Multitasks},
  year = {2024},
  publisher = {Robotics and Internet-of-Things Lab, Prince Sultan University, Riyadh}"


Acknowledgment:

Thanks for Prince Sultan University and RIOTU Lab for their support.

}"""