import gradio as gr from utils import submit_gradio_module, load_retrieval_results, load_reranking_results from fuzzywuzzy import fuzz HEADER = """

The Arabic RAG Leaderboard

The only leaderboard you will require for your RAG needs 🏆

This leaderboard presents the first comprehensive benchmark for Arabic RAG systems, evaluating both retrieval and re-ranking components. Our framework combines real-world queries with synthetic contexts in a dynamic evaluation cycle, ensuring fair and robust assessment of Arabic information retrieval systems.

For technical details, check our blog post here. """ RETRIEVAL_ABOUT_SECTION = """ ## About Retrieval Evaluation The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on: ### Web Search Dataset Metrics - **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result - **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results - **Recall@5**: Measures the proportion of relevant documents found in the top 5 results - **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5 ### Model Requirements - Must support Arabic text embeddings - Should handle queries of at least 512 tokens - Must work with `sentence-transformers` library ### Evaluation Process 1. Models process Arabic web search queries 2. Retrieved documents are evaluated using: - MRR for first relevant result positioning - nDCG for overall ranking quality - Recall@5 for top results accuracy 3. Metrics are averaged to calculate the overall score 4. Models are ranked based on their overall performance ### How to Prepare Your Model - Ensure your model is publicly available on HuggingFace Hub (We don't support private model evaluations yet) - Model should output fixed-dimension embeddings for text - Support batch processing for efficient evaluation (this is default if you use `sentence-transformers`) """ RERANKER_ABOUT_SECTION = """ ## About Reranking Evaluation The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance. ### Evaluation Metrics - **MRR@10 (Mean Reciprocal Rank at 10)**: Measures the ranking quality focusing on the first relevant result in top-10 - **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in top-10 - **MAP (Mean Average Precision)**: Measures the overall precision across all relevant documents All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance. ### Model Requirements - Must accept query-document pairs as input - Should output relevance scores for reranking (has cross-attention or similar mechanism for query-document matching) - Support for Arabic text processing ### Evaluation Process 1. Models are tested on multiple unseen Arabic datasets 2. For each dataset: - Initial candidate documents are provided - Model reranks the candidates - MRR@10, NDCG@10, and MAP are calculated 3. Final scores are averaged across all datasets 4. Models are ranked based on overall performance ### How to Prepare Your Model - Model should be public on HuggingFace Hub (private models are not supported yet) - Make sure it works coherently with `sentence-transformers` library """ CITATION_BUTTON_LABEL = """ Copy the following snippet to cite these results """ CITATION_BUTTON_TEXT = """ @misc{TARL, author = {Mohaned A. Rashad, Hamza Shahid}, title = {The Arabic RAG Leaderboard}, year = {2025}, publisher = {Navid-AI}, howpublished = "url{https://huggingface.co./spaces/Navid-AI/The-Arabic-Rag-Leaderboard}" } """ retrieval_df = None reranking_df = None def search_leaderboard(df, model_name, columns_to_show, threshold=95): if not model_name.strip(): return df.loc[:, columns_to_show] search_name = model_name.lower() # compute once for efficiency def calculate_similarity(row): return fuzz.partial_ratio(search_name, row["Model"].lower()) filtered_df = df.copy() filtered_df["similarity"] = filtered_df.apply(calculate_similarity, axis=1) filtered_df = filtered_df[filtered_df["similarity"] >= threshold].sort_values('similarity', ascending=False) filtered_df = filtered_df.drop('similarity', axis=1).loc[:, columns_to_show] return filtered_df def retrieval_search_leaderboard(model_name, columns_to_show): return search_leaderboard(retrieval_df, model_name, columns_to_show) def reranking_search_leaderboard(model_name, columns_to_show): return search_leaderboard(reranking_df, model_name, columns_to_show) def update_retrieval_columns_to_show(columns_to_show): global retrieval_df dummy_df = retrieval_df.loc[:, [col for col in retrieval_df.columns if col in columns_to_show]] columns_widths = [] for col in dummy_df.columns: if col == "Rank": columns_widths.append(80) elif col == "Model": columns_widths.append(400) else: columns_widths.append(150) return gr.update(value=dummy_df, column_widths=columns_widths) def update_reranker_columns_to_show(columns_to_show): global reranking_df dummy_df = reranking_df.loc[:, [col for col in reranking_df.columns if col in columns_to_show]] columns_widths = [] for col in dummy_df.columns: if col == "Rank": columns_widths.append(80) elif col == "Model": columns_widths.append(400) else: columns_widths.append(150) return gr.update(value=dummy_df, column_widths=columns_widths) def main(): global retrieval_df, reranking_df # Prepare retrieval dataframe retrieval_df = load_retrieval_results(True, "Web Search Dataset (Overall Score)", ["Revision", "Precision", "Task"]) retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df))) retrieval_df = retrieval_df[['Rank', 'Model', 'Web Search Dataset (Overall Score)', 'Model Size (MB)', 'Embedding Dimension', 'Max Tokens', 'Num Likes', 'Downloads Last Month', 'Web Search Dataset (MRR)', 'Web Search Dataset (nDCG@k=None)', 'Web Search Dataset (Recall@5)', 'License']] retrieval_columns_to_show = ["Rank", "Model", "Web Search Dataset (Overall Score)", "Model Size (MB)", "Embedding Dimension", "Max Tokens", "Num Likes"] retrieval_columns_widths = [80, 400, 150, 150, 150, 150, 150] retrieval_cols = retrieval_df.columns.tolist() # cache columns # Prepare reranking dataframe reranking_df = load_reranking_results(True, sort_col="Overall Score", drop_cols=["Revision", "Precision", "Task"]) reranking_df.insert(0, "Rank", range(1, 1 + len(reranking_df))) reranking_df.rename(columns={"nDCG": "nDCG@10", "MRR": "MRR@10"}, inplace=True) reranking_columns_to_show = ["Rank", "Model", "Overall Score", "Model Parameters (in Millions)", "Embedding Dimensions", "Downloads Last Month", "MRR@10", "nDCG@10", "MAP"] reranking_columns_widths = [80, 400, 150, 150, 150, 150, 150, 150, 150] reranking_cols = reranking_df.columns.tolist() # cache columns with gr.Blocks() as demo: gr.HTML(HEADER) with gr.Tabs(): with gr.Tab("đŸ•ĩī¸â€â™‚ī¸ Retrieval"): with gr.Tabs(): with gr.Tab("👑 Leaderboard"): with gr.Row(): search_box_retrieval = gr.Textbox( placeholder="Search for models...", label="Search", scale=5 ) retrieval_columns_to_show_input = gr.CheckboxGroup( label="Columns to Show", choices=retrieval_cols, # use cached list value=retrieval_columns_to_show, scale=4 ) retrieval_leaderboard = gr.Dataframe( value=retrieval_df.loc[:, retrieval_columns_to_show], datatype="markdown", wrap=False, show_fullscreen_button=True, interactive=False, column_widths=retrieval_columns_widths ) # Submit the search box and the leaderboard search_box_retrieval.input( retrieval_search_leaderboard, inputs=[search_box_retrieval, retrieval_columns_to_show_input], outputs=retrieval_leaderboard ) retrieval_columns_to_show_input.select( update_retrieval_columns_to_show, inputs=retrieval_columns_to_show_input, outputs=retrieval_leaderboard ) with gr.Tab("đŸĩī¸ Submit Retriever"): submit_gradio_module("Retriever") with gr.Tab("ℹī¸ About"): gr.Markdown(RETRIEVAL_ABOUT_SECTION) with gr.Tab("📊 Reranking"): with gr.Tabs(): with gr.Tab("👑 Leaderboard"): with gr.Row(): search_box_reranker = gr.Textbox( placeholder="Search for models...", label="Search", scale=5 ) reranking_columns_to_show_input = gr.CheckboxGroup( label="Columns to Show", choices=reranking_cols, # use cached list value=reranking_columns_to_show, scale=4 ) reranker_leaderboard = gr.Dataframe( value=reranking_df[reranking_columns_to_show], datatype="markdown", wrap=False, show_fullscreen_button=True, interactive=False, column_widths=reranking_columns_widths ) # Submit the search box and the leaderboard search_box_reranker.input( reranking_search_leaderboard, inputs=[search_box_reranker, reranking_columns_to_show_input], outputs=reranker_leaderboard ) reranking_columns_to_show_input.select( update_reranker_columns_to_show, inputs=reranking_columns_to_show_input, outputs=reranker_leaderboard ) with gr.Tab("đŸĩī¸ Submit Reranker"): submit_gradio_module("Reranker") with gr.Tab("ℹī¸ About"): gr.Markdown(RERANKER_ABOUT_SECTION) with gr.Row(): with gr.Accordion("📙 Citation", open=False): gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True, ) demo.launch() if __name__ == "__main__": main()