Spaces:

sartifyllc
/

Swahili-Text-Embeddings-Leaderboard

Running

App Files Community

Mollel committed on Jul 13, 2024

Commit

00b7e99

verified ·

1 Parent(s): 57671d9

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -78

app.py CHANGED Viewed

@@ -3,9 +3,10 @@ import pandas as pd
 import io
 import re
 # Constants
 GITHUB_URL = "https://github.com/Sartify/STEL"
-POSSIBLE_NON_BENCHMARK_COLS = ["Open?", "Publisher", "Basemodel", "Matryoshka", "Dimension", "Model Name"]
 def extract_table_from_markdown(markdown_text, table_start):
     """Extract table content from markdown text."""
@@ -15,7 +16,6 @@ def extract_table_from_markdown(markdown_text, table_start):
     for line in lines:
         if line.startswith(table_start):
             capture = True
-            continue
         if capture and line.strip() == '':
             break
         if capture:
@@ -24,17 +24,37 @@ def extract_table_from_markdown(markdown_text, table_start):
 def markdown_table_to_df(table_content):
     """Convert markdown table to pandas DataFrame."""
-    df = pd.read_csv(io.StringIO(table_content), sep='|', skipinitialspace=True)
-    df.columns = df.columns.str.strip()
-    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
-    df = df.dropna(axis=1, how='all')
     return df
 def setup_page():
     """Set up the Streamlit page."""
     st.set_page_config(page_title="Swahili Text Embeddings Leaderboard", page_icon="⚡", layout="wide")
     st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
-    st.image("https://raw.githubusercontent.com/username/repo/main/files/STEL.jpg", width=300)
 def display_leaderboard(df):
     """Display the leaderboard."""
@@ -51,79 +71,13 @@ def display_leaderboard(df):
     df_display = df[present_non_benchmark_cols + selected_columns]
     # Display dataframe
-    st.dataframe(df_display)
     # Download buttons
     csv = df_display.to_csv(index=False)
     st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
-def display_evaluation():
-    """Display the evaluation section."""
-    st.header("🧪 Evaluation")
-    st.markdown("""
-    To evaluate a model on the Swahili Embeddings Text Benchmark, you can use the following Python script:
-    ```python
-    pip install mteb
-    pip install sentence-transformers
-    import mteb
-    from sentence_transformers import SentenceTransformer
-    models = ["sartifyllc/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka"]
-    for model_name in models:
-        truncate_dim = 768
-        language = "swa"
-        device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
-        model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
-        tasks = [
-            mteb.get_task("AfriSentiClassification", languages=["swa"]),
-            mteb.get_task("AfriSentiLangClassification", languages=["swa"]),
-            mteb.get_task("MasakhaNEWSClassification", languages=["swa"]),
-            mteb.get_task("MassiveIntentClassification", languages=["swa"]),
-            mteb.get_task("MassiveScenarioClassification", languages=["swa"]),
-            mteb.get_task("SwahiliNewsClassification", languages=["swa"]),
-        ]
-        evaluation = mteb.MTEB(tasks=tasks)
-        results = evaluation.run(model, output_folder=f"{model_name}")
-        tasks = mteb.get_tasks(task_types=["PairClassification", "Reranking", "BitextMining", "Clustering", "Retrieval"], languages=["swa"])
-        evaluation = mteb.MTEB(tasks=tasks)
-        results = evaluation.run(model, output_folder=f"{model_name}")
-    ```
-    """)
-def display_contribution():
-    """Display the contribution section."""
-    st.header("🤝 How to Contribute")
-    st.markdown("""
-    We welcome and appreciate all contributions! You can help by:
-    ### Table Work
-    - Filling in missing entries.
-    - New models are added as new rows to the leaderboard (maintaining descending order).
-    - Add new benchmarks as new columns in the leaderboard and include them in the benchmarks table (maintaining descending order).
-    ### Code Work
-    - Improving the existing code.
-    - Requesting and implementing new features.
-    """)
-def display_sponsorship():
-    """Display the sponsorship section."""
-    st.header("🤝 Sponsorship")
-    st.markdown("""
-    This benchmark is Swahili-based, and we need support translating and curating more tasks into Swahili.
-    Sponsorships are welcome to help advance this endeavour. Your sponsorship will facilitate essential
-    translation efforts, bridge language barriers, and make the benchmark accessible to a broader audience.
-    We are grateful for the dedication shown by our collaborators and aim to extend this impact further
-    with the support of sponsors committed to advancing language technologies.
-    """)
 def main():
     setup_page()
@@ -145,6 +99,4 @@ def main():
     st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")
 if __name__ == "__main__":
-    main()

 import io
 import re
 # Constants
 GITHUB_URL = "https://github.com/Sartify/STEL"
+POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]
 def extract_table_from_markdown(markdown_text, table_start):
     """Extract table content from markdown text."""
     for line in lines:
         if line.startswith(table_start):
             capture = True
         if capture and line.strip() == '':
             break
         if capture:
 def _split_markdown_row(line):
     """Split one markdown table row into stripped cells, keeping empty cells."""
     stripped = line.strip()
     if not stripped:
         return []
     # Remove only the outer border pipes so interior empty cells keep their position.
     if stripped.startswith('|'):
         stripped = stripped[1:]
     if stripped.endswith('|'):
         stripped = stripped[:-1]
     return [cell.strip() for cell in stripped.split('|')]
 def markdown_table_to_df(table_content):
     """Convert markdown table to pandas DataFrame.
 
     The first line of ``table_content`` is read as the header row and the
     second line (the ``|---|---|`` separator) is skipped. Every following
     non-empty line becomes one data row.
 
     Cells are matched to headers by position. An empty cell becomes a missing
     value (``None``) instead of being dropped, so the remaining values of that
     row stay in their own columns. Rows with fewer cells than the header are
     padded with missing values on the right.
 
     Columns whose values all parse as numbers are converted to float, with
     missing cells becoming NaN. Any other column keeps its string values.
 
     Args:
         table_content: Markdown table text, one table row per line.
 
     Returns:
         A DataFrame with one column per header cell.
     """
     lines = table_content.split('\n')
     headers = _split_markdown_row(lines[0])
     data = []
     for line in lines[2:]:  # Skip the header separator line
         cells = _split_markdown_row(line)
         if not any(cells):
             continue  # Blank line or a row made only of empty cells
         row = [cell or None for cell in cells]
         # Pad short rows so trailing missing cells do not break alignment.
         row.extend([None] * (len(headers) - len(row)))
         data.append(row)
     # Explicit object dtype keeps the conversion below independent of how
     # pandas would otherwise infer string columns.
     df = pd.DataFrame(data, columns=headers, dtype=object)
     for col in df.columns:
         try:
             df[col] = df[col].astype(float)
         except (ValueError, TypeError):
             pass  # Keep the column as text if any value is not numeric
     return df
 def setup_page():
     """Set up the Streamlit page.
 
     Sets the page title, icon and wide layout, then renders the leaderboard
     heading followed by the STEL image at a width of 300.
     """
     st.set_page_config(page_title="Swahili Text Embeddings Leaderboard", page_icon="⚡", layout="wide")
     st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
     # NOTE(review): "username/repo" in the URL below looks like a placeholder
     # path; confirm the real repository location of STEL.jpg.
+    st.image("https://raw.githubusercontent.com/username/repo/main/STEL.jpg", width=300)
 def display_leaderboard(df):
     """Display the leaderboard."""
     df_display = df[present_non_benchmark_cols + selected_columns]
     # Display dataframe
+    st.dataframe(df_display.style.format("{:.4f}", subset=selected_columns))
     # Download buttons
     csv = df_display.to_csv(index=False)
     st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
+# ... (rest of the code remains the same)
 def main():
     setup_page()
     st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")
 if __name__ == "__main__":
+    main()