import gradio as gr
from utils import submit_gradio_module, load_retrieval_results, load_reranking_results
from fuzzywuzzy import fuzz
HEADER = """<div style="text-align: center; margin-bottom: 20px;">
<h1>The Arabic RAG Leaderboard</h1>
<p style="font-size: 16px; color: #888;">The only leaderboard you will require for your RAG needs πŸ†</p>
</div>
This leaderboard presents the first comprehensive benchmark for Arabic RAG systems, evaluating both retrieval and re-ranking components. Our framework combines real-world queries with synthetic contexts in a dynamic evaluation cycle, ensuring fair and robust assessment of Arabic information retrieval systems.
<br>
<br>
For technical details, check our blog post <a href="https://huggingface.co./blog/Navid-AI/arabic-rag-leaderboard">here</a>.
"""
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation
The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:
### Web Search Dataset Metrics
- **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5
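For instance, with made-up per-dataset numbers, the overall score is simply the plain average of the three metrics:
```python
# Illustrative only: the metric values below are made up.
mrr, ndcg, recall_at_5 = 0.62, 0.71, 0.80
overall_score = (mrr + ndcg + recall_at_5) / 3   # = 0.71
```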
### Model Requirements
- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with `sentence-transformers` library
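A quick local check of these requirements might look like this (the model name is a placeholder for your own Hub repository):
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-username/your-arabic-embedding-model")  # placeholder repo id

print(model.get_sentence_embedding_dimension())  # fixed embedding dimension
print(model.max_seq_length)                      # should be at least 512

embedding = model.encode("ما هي عاصمة المملكة العربية السعودية؟")  # "What is the capital of Saudi Arabia?"
print(embedding.shape)
```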
### Evaluation Process
1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
- MRR for first relevant result positioning
- nDCG for overall ranking quality
- Recall@5 for top results accuracy
3. Metrics are averaged to calculate the overall score
4. Models are ranked based on their overall performance
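In rough outline, the retrieval step of this process can be reproduced locally with `sentence-transformers` (the corpus, queries, and model name below are placeholders):
```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("your-username/your-arabic-embedding-model")  # placeholder repo id

corpus = ["نص الوثيقة الأولى ...", "نص الوثيقة الثانية ..."]  # candidate documents
queries = ["استعلام بحث عربي ..."]                            # Arabic web-search queries

corpus_embeddings = model.encode(corpus, batch_size=32, convert_to_tensor=True)
query_embeddings = model.encode(queries, batch_size=32, convert_to_tensor=True)

# Top-5 documents per query by cosine similarity; MRR, nDCG, and Recall@5
# are then computed from these rankings.
results = util.semantic_search(query_embeddings, corpus_embeddings, top_k=5)
```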
### How to Prepare Your Model
- Ensure your model is publicly available on the HuggingFace Hub (we don't support private model evaluations yet)
- Model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
"""
RERANKER_ABOUT_SECTION = """
## About Reranking Evaluation
The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.
### Evaluation Metrics
- **MRR@10 (Mean Reciprocal Rank at 10)**: Measures ranking quality based on the position of the first relevant result within the top 10
- **nDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results within the top 10
- **MAP (Mean Average Precision)**: Averages precision over the positions of relevant documents, then over all queries
All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.
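As a toy illustration of how MAP behaves for a single query (the relevance labels below are made up):
```python
# 1 = relevant document at that rank, 0 = non-relevant (toy data).
ranked_relevance = [1, 0, 1, 0]
hits, precisions = 0, []
for rank, rel in enumerate(ranked_relevance, start=1):
    if rel:
        hits += 1
        precisions.append(hits / rank)
average_precision = sum(precisions) / max(hits, 1)   # (1/1 + 2/3) / 2 ≈ 0.83
# MAP is this value averaged over all queries, then over all datasets.
```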
### Model Requirements
- Must accept query-document pairs as input
- Should output a relevance score for each query-document pair (e.g., via cross-attention or a similar query-document matching mechanism)
- Support for Arabic text processing
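For example, a `sentence-transformers` cross-encoder satisfies these requirements (the model name is a placeholder):
```python
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("your-username/your-arabic-reranker")  # placeholder repo id

query = "ما هي عاصمة المملكة العربية السعودية؟"  # "What is the capital of Saudi Arabia?"
docs = [
    "الرياض هي عاصمة المملكة العربية السعودية.",  # "Riyadh is the capital of Saudi Arabia."
    "القاهرة هي عاصمة مصر.",                      # "Cairo is the capital of Egypt."
]

# One relevance score per (query, document) pair.
scores = reranker.predict([(query, doc) for doc in docs])
print(scores)
```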
### Evaluation Process
1. Models are tested on multiple unseen Arabic datasets
2. For each dataset:
- Initial candidate documents are provided
- Model reranks the candidates
- MRR@10, NDCG@10, and MAP are calculated
3. Final scores are averaged across all datasets
4. Models are ranked based on overall performance
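A minimal sketch of step 2 for a single query, with placeholder candidates and scores:
```python
# Toy data: candidate ids, the set of relevant ids, and scores from reranker.predict(...).
candidates = ["doc_a", "doc_b", "doc_c"]
relevant = {"doc_c"}
scores = {"doc_a": 0.12, "doc_b": 0.48, "doc_c": 0.91}

reranked = sorted(candidates, key=lambda doc: scores[doc], reverse=True)

mrr_at_10 = 0.0
for rank, doc in enumerate(reranked[:10], start=1):
    if doc in relevant:
        mrr_at_10 = 1.0 / rank
        break
# nDCG@10 and MAP are computed on the same reranked list, and all three
# metrics are averaged across datasets for the final score.
```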
### How to Prepare Your Model
- The model should be public on the HuggingFace Hub (private models are not supported yet)
- Make sure it works correctly with the `sentence-transformers` library
"""
CITATION_BUTTON_LABEL = """
Copy the following snippet to cite these results
"""
CITATION_BUTTON_TEXT = """
@misc{TARL,
author = {Mohamed A. Rashad and Hamza Shahid},
title = {The Arabic RAG Leaderboard},
year = {2025},
publisher = {Navid-AI},
howpublished = "url{https://huggingface.co./spaces/Navid-AI/The-Arabic-Rag-Leaderboard}"
}
"""
retrieval_df = None
reranking_df = None
def search_leaderboard(df, model_name, columns_to_show, threshold=95):
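    """Filter df to rows whose model name fuzzily matches model_name, keeping only columns_to_show."""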
if not model_name.strip():
return df.loc[:, columns_to_show]
search_name = model_name.lower() # compute once for efficiency
def calculate_similarity(row):
return fuzz.partial_ratio(search_name, row["Model"].lower())
filtered_df = df.copy()
filtered_df["similarity"] = filtered_df.apply(calculate_similarity, axis=1)
filtered_df = filtered_df[filtered_df["similarity"] >= threshold].sort_values('similarity', ascending=False)
filtered_df = filtered_df.drop('similarity', axis=1).loc[:, columns_to_show]
return filtered_df
def retrieval_search_leaderboard(model_name, columns_to_show):
return search_leaderboard(retrieval_df, model_name, columns_to_show)
def reranking_search_leaderboard(model_name, columns_to_show):
return search_leaderboard(reranking_df, model_name, columns_to_show)
def _update_columns_to_show(df, columns_to_show):
    # Shared helper: show only the selected columns and keep the usual
    # widths (narrow "Rank", wide "Model", 150px for everything else).
    shown_df = df.loc[:, [col for col in df.columns if col in columns_to_show]]
    column_widths = [80 if col == "Rank" else 400 if col == "Model" else 150 for col in shown_df.columns]
    return gr.update(value=shown_df, column_widths=column_widths)
def update_retrieval_columns_to_show(columns_to_show):
    return _update_columns_to_show(retrieval_df, columns_to_show)
def update_reranker_columns_to_show(columns_to_show):
    return _update_columns_to_show(reranking_df, columns_to_show)
def main():
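    """Load the leaderboard results and build the Gradio demo with Retrieval and Reranking tabs."""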
global retrieval_df, reranking_df
# Prepare retrieval dataframe
retrieval_df = load_retrieval_results(True, "Web Search Dataset (Overall Score)", ["Revision", "Precision", "Task"])
retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
retrieval_df = retrieval_df[['Rank', 'Model', 'Web Search Dataset (Overall Score)', 'Model Size (MB)', 'Embedding Dimension', 'Max Tokens', 'Num Likes', 'Downloads Last Month', 'Web Search Dataset (MRR)', 'Web Search Dataset (nDCG@k=None)', 'Web Search Dataset (Recall@5)', 'License']]
retrieval_columns_to_show = ["Rank", "Model", "Web Search Dataset (Overall Score)", "Model Size (MB)", "Embedding Dimension", "Max Tokens", "Num Likes"]
retrieval_columns_widths = [80, 400, 150, 150, 150, 150, 150]
retrieval_cols = retrieval_df.columns.tolist() # cache columns
# Prepare reranking dataframe
reranking_df = load_reranking_results(True, sort_col="Overall Score", drop_cols=["Revision", "Precision", "Task"])
reranking_df.insert(0, "Rank", range(1, 1 + len(reranking_df)))
reranking_df.rename(columns={"nDCG": "nDCG@10", "MRR": "MRR@10"}, inplace=True)
reranking_columns_to_show = ["Rank", "Model", "Overall Score", "Model Parameters (in Millions)", "Embedding Dimensions", "Downloads Last Month", "MRR@10", "nDCG@10", "MAP"]
reranking_columns_widths = [80, 400, 150, 150, 150, 150, 150, 150, 150]
reranking_cols = reranking_df.columns.tolist() # cache columns
with gr.Blocks() as demo:
gr.HTML(HEADER)
with gr.Tabs():
with gr.Tab("πŸ•΅οΈβ€β™‚οΈ Retrieval"):
with gr.Tabs():
with gr.Tab("πŸ‘‘ Leaderboard"):
with gr.Row():
search_box_retrieval = gr.Textbox(
placeholder="Search for models...",
label="Search",
scale=5
)
retrieval_columns_to_show_input = gr.CheckboxGroup(
label="Columns to Show",
choices=retrieval_cols, # use cached list
value=retrieval_columns_to_show,
scale=4
)
retrieval_leaderboard = gr.Dataframe(
value=retrieval_df.loc[:, retrieval_columns_to_show],
datatype="markdown",
wrap=False,
show_fullscreen_button=True,
interactive=False,
column_widths=retrieval_columns_widths
)
# Submit the search box and the leaderboard
search_box_retrieval.input(
retrieval_search_leaderboard,
inputs=[search_box_retrieval, retrieval_columns_to_show_input],
outputs=retrieval_leaderboard
)
retrieval_columns_to_show_input.select(
update_retrieval_columns_to_show,
inputs=retrieval_columns_to_show_input,
outputs=retrieval_leaderboard
)
with gr.Tab("🏡️ Submit Retriever"):
submit_gradio_module("Retriever")
with gr.Tab("ℹ️ About"):
gr.Markdown(RETRIEVAL_ABOUT_SECTION)
with gr.Tab("πŸ“Š Reranking"):
with gr.Tabs():
with gr.Tab("πŸ‘‘ Leaderboard"):
with gr.Row():
search_box_reranker = gr.Textbox(
placeholder="Search for models...",
label="Search",
scale=5
)
reranking_columns_to_show_input = gr.CheckboxGroup(
label="Columns to Show",
choices=reranking_cols, # use cached list
value=reranking_columns_to_show,
scale=4
)
reranker_leaderboard = gr.Dataframe(
value=reranking_df[reranking_columns_to_show],
datatype="markdown",
wrap=False,
show_fullscreen_button=True,
interactive=False,
column_widths=reranking_columns_widths
)
# Submit the search box and the leaderboard
search_box_reranker.input(
reranking_search_leaderboard,
inputs=[search_box_reranker, reranking_columns_to_show_input],
outputs=reranker_leaderboard
)
reranking_columns_to_show_input.select(
update_reranker_columns_to_show,
inputs=reranking_columns_to_show_input,
outputs=reranker_leaderboard
)
with gr.Tab("🏡️ Submit Reranker"):
submit_gradio_module("Reranker")
with gr.Tab("ℹ️ About"):
gr.Markdown(RERANKER_ABOUT_SECTION)
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
demo.launch()
if __name__ == "__main__":
main()