import gradio as gr
from utils import submit_gradio_module, load_retrieval_results, load_reranking_results
from fuzzywuzzy import fuzz

HEADER = """<div style="text-align: center; margin-bottom: 20px;"> | |
<h1>The Arabic RAG Leaderboard</h1> | |
<p style="font-size: 16px; color: #888;">The only leaderboard you will require for your RAG needs π</p> | |
</div> | |
This leaderboard presents the first comprehensive benchmark for Arabic RAG systems, evaluating both retrieval and re-ranking components. Our framework combines real-world queries with synthetic contexts in a dynamic evaluation cycle, ensuring fair and robust assessment of Arabic information retrieval systems. | |
<br> | |
<br> | |
For technical details, check our blog post <a href="https://huggingface.co./blog/Navid-AI/arabic-rag-leaderboard">here</a>. | |
""" | |
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation

The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:

### Web Search Dataset Metrics

- **MRR (Mean Reciprocal Rank)**: Measures ranking quality by focusing on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates ranking quality considering all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: The average of MRR, nDCG, and Recall@5 (illustrated in the sketch below)
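
As a rough illustration of how these metrics combine, here is a toy, binary-relevance sketch (the document IDs are hypothetical, and the leaderboard's internal implementation may differ in detail):

```python
import math

def mrr(relevant, ranked):
    # reciprocal rank of the first relevant document
    return next((1.0 / i for i, d in enumerate(ranked, 1) if d in relevant), 0.0)

def ndcg(relevant, ranked):
    # binary-relevance nDCG: DCG of this ranking over DCG of the ideal ranking
    dcg = sum(1.0 / math.log2(i + 1) for i, d in enumerate(ranked, 1) if d in relevant)
    idcg = sum(1.0 / math.log2(i + 1) for i in range(1, min(len(relevant), len(ranked)) + 1))
    return dcg / idcg if idcg else 0.0

def recall_at_k(relevant, ranked, k=5):
    # fraction of the relevant documents that appear in the top k
    return len(relevant & set(ranked[:k])) / len(relevant)

relevant = {"doc2", "doc5"}
ranked = ["doc1", "doc2", "doc3", "doc5", "doc4"]
overall = (mrr(relevant, ranked) + ndcg(relevant, ranked) + recall_at_k(relevant, ranked)) / 3
```
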
### Model Requirements

- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with the `sentence-transformers` library

### Evaluation Process

1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
   - MRR for first relevant result positioning
   - nDCG for overall ranking quality
   - Recall@5 for top results accuracy
3. Metrics are averaged to calculate the overall score
4. Models are ranked based on their overall performance

### How to Prepare Your Model

- Ensure your model is publicly available on the HuggingFace Hub (we don't support private model evaluations yet)
- The model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
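
If your model loads with `sentence-transformers`, it should already satisfy these requirements. A minimal sanity check (the model ID below is a placeholder, not a real checkpoint):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-username/your-arabic-embedding-model")  # placeholder ID
embeddings = model.encode(
    ["ما هي عاصمة مصر؟", "القاهرة هي عاصمة مصر."],
    batch_size=32,  # batching is handled for you by sentence-transformers
)
print(embeddings.shape)  # (2, embedding_dimension) -- fixed-dimension vectors
```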
""" | |
RERANKER_ABOUT_SECTION = """
## About Reranking Evaluation

The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.

### Evaluation Metrics

- **MRR@10 (Mean Reciprocal Rank at 10)**: Measures ranking quality, focusing on the first relevant result in the top 10
- **nDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in the top 10
- **MAP (Mean Average Precision)**: Measures overall precision across all relevant documents

All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.
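
For readers unfamiliar with MAP (the one metric not covered in the retrieval sketch above), here is a minimal binary-relevance version of average precision for a single query; MAP is its mean over all queries. The document IDs are hypothetical:

```python
def average_precision(relevant, ranked):
    # mean of precision@i taken at each position i where a relevant doc appears
    hits, precision_sum = 0, 0.0
    for i, doc in enumerate(ranked, start=1):
        if doc in relevant:
            hits += 1
            precision_sum += hits / i
    return precision_sum / len(relevant) if relevant else 0.0

# e.g. average_precision({"doc1", "doc4"}, ["doc1", "doc2", "doc3", "doc4"])
# -> (1/1 + 2/4) / 2 = 0.75; MAP averages this value over every query
```
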
### Model Requirements

- Must accept query-document pairs as input
- Should output relevance scores for reranking (i.e., has cross-attention or a similar mechanism for query-document matching)
- Support for Arabic text processing
### Evaluation Process

1. Models are tested on multiple unseen Arabic datasets
2. For each dataset:
   - Initial candidate documents are provided
   - The model reranks the candidates
   - MRR@10, nDCG@10, and MAP are calculated
3. Final scores are averaged across all datasets
4. Models are ranked based on overall performance
### How to Prepare Your Model

- The model should be public on the HuggingFace Hub (private models are not supported yet)
- Make sure it works with the `sentence-transformers` library (see the sketch below)
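
Reranker checkpoints that follow the `sentence-transformers` `CrossEncoder` interface take query-document pairs and return relevance scores. A minimal sketch (the model ID is a placeholder):

```python
from sentence_transformers import CrossEncoder

model = CrossEncoder("your-username/your-arabic-reranker")  # placeholder ID
scores = model.predict([
    ("ما هي عاصمة مصر؟", "القاهرة هي عاصمة مصر."),
    ("ما هي عاصمة مصر؟", "الرياض هي عاصمة السعودية."),
])
# higher score = more relevant; sort candidates by score to rerank
```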
""" | |
CITATION_BUTTON_LABEL = """
Copy the following snippet to cite these results
"""

CITATION_BUTTON_TEXT = r"""
@misc{TARL,
    author = {Mohaned A. Rashad and Hamza Shahid},
    title = {The Arabic RAG Leaderboard},
    year = {2025},
    publisher = {Navid-AI},
    howpublished = {\url{https://huggingface.co./spaces/Navid-AI/The-Arabic-Rag-Leaderboard}}
}
"""

retrieval_df = None
reranking_df = None

def search_leaderboard(df, model_name, columns_to_show, threshold=95):
    if not model_name.strip():
        return df.loc[:, columns_to_show]
    search_name = model_name.lower()  # compute once for efficiency

    def calculate_similarity(row):
        return fuzz.partial_ratio(search_name, row["Model"].lower())

    filtered_df = df.copy()
    filtered_df["similarity"] = filtered_df.apply(calculate_similarity, axis=1)
    filtered_df = filtered_df[filtered_df["similarity"] >= threshold].sort_values("similarity", ascending=False)
    return filtered_df.drop("similarity", axis=1).loc[:, columns_to_show]
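
# Illustrative behaviour of the fuzzy match above (hypothetical model names):
# fuzz.partial_ratio scores the best substring alignment, so with the default
# threshold of 95 only near-exact substring matches survive, e.g.
#   fuzz.partial_ratio("bge", "baai/bge-m3") == 100   -> kept
#   fuzz.partial_ratio("bge", "intfloat/e5-base")     -> far below 95, dropped
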
def retrieval_search_leaderboard(model_name, columns_to_show):
    return search_leaderboard(retrieval_df, model_name, columns_to_show)

def reranking_search_leaderboard(model_name, columns_to_show):
    return search_leaderboard(reranking_df, model_name, columns_to_show)

def _columns_update(df, columns_to_show):
    # Restrict the dataframe to the selected columns (preserving column order)
    # and recompute per-column widths for the gr.Dataframe.
    dummy_df = df.loc[:, [col for col in df.columns if col in columns_to_show]]
    column_widths = []
    for col in dummy_df.columns:
        if col == "Rank":
            column_widths.append(80)
        elif col == "Model":
            column_widths.append(400)
        else:
            column_widths.append(150)
    return gr.update(value=dummy_df, column_widths=column_widths)

def update_retrieval_columns_to_show(columns_to_show):
    return _columns_update(retrieval_df, columns_to_show)

def update_reranker_columns_to_show(columns_to_show):
    return _columns_update(reranking_df, columns_to_show)

def main():
    global retrieval_df, reranking_df

    # Prepare retrieval dataframe
    retrieval_df = load_retrieval_results(True, "Web Search Dataset (Overall Score)", ["Revision", "Precision", "Task"])
    retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
    retrieval_df = retrieval_df[['Rank', 'Model', 'Web Search Dataset (Overall Score)', 'Model Size (MB)', 'Embedding Dimension', 'Max Tokens', 'Num Likes', 'Downloads Last Month', 'Web Search Dataset (MRR)', 'Web Search Dataset (nDCG@k=None)', 'Web Search Dataset (Recall@5)', 'License']]
    retrieval_columns_to_show = ["Rank", "Model", "Web Search Dataset (Overall Score)", "Model Size (MB)", "Embedding Dimension", "Max Tokens", "Num Likes"]
    retrieval_columns_widths = [80, 400, 150, 150, 150, 150, 150]
    retrieval_cols = retrieval_df.columns.tolist()  # cache columns

    # Prepare reranking dataframe
    reranking_df = load_reranking_results(True, sort_col="Overall Score", drop_cols=["Revision", "Precision", "Task"])
    reranking_df.insert(0, "Rank", range(1, 1 + len(reranking_df)))
    reranking_df.rename(columns={"nDCG": "nDCG@10", "MRR": "MRR@10"}, inplace=True)
    reranking_columns_to_show = ["Rank", "Model", "Overall Score", "Model Parameters (in Millions)", "Embedding Dimensions", "Downloads Last Month", "MRR@10", "nDCG@10", "MAP"]
    reranking_columns_widths = [80, 400, 150, 150, 150, 150, 150, 150, 150]
    reranking_cols = reranking_df.columns.tolist()  # cache columns

    with gr.Blocks() as demo:
        gr.HTML(HEADER)
        with gr.Tabs():
            with gr.Tab("🕵️‍♂️ Retrieval"):
                with gr.Tabs():
                    with gr.Tab("🏆 Leaderboard"):
                        with gr.Row():
                            search_box_retrieval = gr.Textbox(
                                placeholder="Search for models...",
                                label="Search",
                                scale=5,
                            )
                            retrieval_columns_to_show_input = gr.CheckboxGroup(
                                label="Columns to Show",
                                choices=retrieval_cols,  # use cached list
                                value=retrieval_columns_to_show,
                                scale=4,
                            )
                        retrieval_leaderboard = gr.Dataframe(
                            value=retrieval_df.loc[:, retrieval_columns_to_show],
                            datatype="markdown",
                            wrap=False,
                            show_fullscreen_button=True,
                            interactive=False,
                            column_widths=retrieval_columns_widths,
                        )
                        # Wire the search box and the column selector to the leaderboard
                        search_box_retrieval.input(
                            retrieval_search_leaderboard,
                            inputs=[search_box_retrieval, retrieval_columns_to_show_input],
                            outputs=retrieval_leaderboard,
                        )
                        retrieval_columns_to_show_input.select(
                            update_retrieval_columns_to_show,
                            inputs=retrieval_columns_to_show_input,
                            outputs=retrieval_leaderboard,
                        )
with gr.Tab("π΅οΈ Submit Retriever"): | |
submit_gradio_module("Retriever") | |
with gr.Tab("βΉοΈ About"): | |
gr.Markdown(RETRIEVAL_ABOUT_SECTION) | |
with gr.Tab("π Reranking"): | |
with gr.Tabs(): | |
with gr.Tab("π Leaderboard"): | |
with gr.Row(): | |
search_box_reranker = gr.Textbox( | |
placeholder="Search for models...", | |
label="Search", | |
scale=5 | |
) | |
reranking_columns_to_show_input = gr.CheckboxGroup( | |
label="Columns to Show", | |
choices=reranking_cols, # use cached list | |
value=reranking_columns_to_show, | |
scale=4 | |
) | |
reranker_leaderboard = gr.Dataframe( | |
value=reranking_df[reranking_columns_to_show], | |
datatype="markdown", | |
wrap=False, | |
show_fullscreen_button=True, | |
interactive=False, | |
column_widths=reranking_columns_widths | |
) | |
                        # Wire the search box and the column selector to the leaderboard
                        search_box_reranker.input(
                            reranking_search_leaderboard,
                            inputs=[search_box_reranker, reranking_columns_to_show_input],
                            outputs=reranker_leaderboard,
                        )
                        reranking_columns_to_show_input.select(
                            update_reranker_columns_to_show,
                            inputs=reranking_columns_to_show_input,
                            outputs=reranker_leaderboard,
                        )
with gr.Tab("π΅οΈ Submit Reranker"): | |
submit_gradio_module("Reranker") | |
with gr.Tab("βΉοΈ About"): | |
gr.Markdown(RERANKER_ABOUT_SECTION) | |
        with gr.Row():
            with gr.Accordion("📝 Citation", open=False):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    lines=20,
                    elem_id="citation-button",
                    show_copy_button=True,
                )

    demo.launch()

if __name__ == "__main__":
    main()