Mollel committed on
Commit
00b7e99
1 Parent(s): 57671d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -78
app.py CHANGED
@@ -3,9 +3,10 @@ import pandas as pd
3
  import io
4
  import re
5
 
 
6
  # Constants
7
  GITHUB_URL = "https://github.com/Sartify/STEL"
8
- POSSIBLE_NON_BENCHMARK_COLS = ["Open?", "Publisher", "Basemodel", "Matryoshka", "Dimension", "Model Name"]
9
 
10
  def extract_table_from_markdown(markdown_text, table_start):
11
  """Extract table content from markdown text."""
@@ -15,7 +16,6 @@ def extract_table_from_markdown(markdown_text, table_start):
15
  for line in lines:
16
  if line.startswith(table_start):
17
  capture = True
18
- continue
19
  if capture and line.strip() == '':
20
  break
21
  if capture:
@@ -24,17 +24,37 @@ def extract_table_from_markdown(markdown_text, table_start):
24
 
25
  def markdown_table_to_df(table_content):
26
  """Convert markdown table to pandas DataFrame."""
27
- df = pd.read_csv(io.StringIO(table_content), sep='|', skipinitialspace=True)
28
- df.columns = df.columns.str.strip()
29
- df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
30
- df = df.dropna(axis=1, how='all')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  return df
32
 
33
  def setup_page():
34
  """Set up the Streamlit page."""
35
  st.set_page_config(page_title="Swahili Text Embeddings Leaderboard", page_icon="⚡", layout="wide")
36
  st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
37
- st.image("https://raw.githubusercontent.com/username/repo/main/files/STEL.jpg", width=300)
38
 
39
  def display_leaderboard(df):
40
  """Display the leaderboard."""
@@ -51,79 +71,13 @@ def display_leaderboard(df):
51
  df_display = df[present_non_benchmark_cols + selected_columns]
52
 
53
  # Display dataframe
54
- st.dataframe(df_display)
55
 
56
  # Download buttons
57
  csv = df_display.to_csv(index=False)
58
  st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
59
 
60
- def display_evaluation():
61
- """Display the evaluation section."""
62
- st.header("🧪 Evaluation")
63
- st.markdown("""
64
- To evaluate a model on the Swahili Embeddings Text Benchmark, you can use the following Python script:
65
- ```python
66
- pip install mteb
67
- pip install sentence-transformers
68
- import mteb
69
- from sentence_transformers import SentenceTransformer
70
-
71
- models = ["sartifyllc/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka"]
72
-
73
- for model_name in models:
74
- truncate_dim = 768
75
- language = "swa"
76
-
77
- device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
78
- model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
79
-
80
- tasks = [
81
- mteb.get_task("AfriSentiClassification", languages=["swa"]),
82
- mteb.get_task("AfriSentiLangClassification", languages=["swa"]),
83
- mteb.get_task("MasakhaNEWSClassification", languages=["swa"]),
84
- mteb.get_task("MassiveIntentClassification", languages=["swa"]),
85
- mteb.get_task("MassiveScenarioClassification", languages=["swa"]),
86
- mteb.get_task("SwahiliNewsClassification", languages=["swa"]),
87
- ]
88
-
89
- evaluation = mteb.MTEB(tasks=tasks)
90
- results = evaluation.run(model, output_folder=f"{model_name}")
91
-
92
- tasks = mteb.get_tasks(task_types=["PairClassification", "Reranking", "BitextMining", "Clustering", "Retrieval"], languages=["swa"])
93
-
94
- evaluation = mteb.MTEB(tasks=tasks)
95
- results = evaluation.run(model, output_folder=f"{model_name}")
96
- ```
97
- """)
98
-
99
- def display_contribution():
100
- """Display the contribution section."""
101
- st.header("🤝 How to Contribute")
102
- st.markdown("""
103
- We welcome and appreciate all contributions! You can help by:
104
-
105
- ### Table Work
106
-
107
- - Filling in missing entries.
108
- - New models are added as new rows to the leaderboard (maintaining descending order).
109
- - Add new benchmarks as new columns in the leaderboard and include them in the benchmarks table (maintaining descending order).
110
-
111
- ### Code Work
112
-
113
- - Improving the existing code.
114
- - Requesting and implementing new features.
115
- """)
116
-
117
- def display_sponsorship():
118
- """Display the sponsorship section."""
119
- st.header("🤝 Sponsorship")
120
- st.markdown("""
121
- This benchmark is Swahili-based, and we need support translating and curating more tasks into Swahili.
122
- Sponsorships are welcome to help advance this endeavour. Your sponsorship will facilitate essential
123
- translation efforts, bridge language barriers, and make the benchmark accessible to a broader audience.
124
- We are grateful for the dedication shown by our collaborators and aim to extend this impact further
125
- with the support of sponsors committed to advancing language technologies.
126
- """)
127
 
128
  def main():
129
  setup_page()
@@ -145,6 +99,4 @@ def main():
145
  st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")
146
 
147
  if __name__ == "__main__":
148
- main()
149
-
150
-
 
3
  import io
4
  import re
5
 
6
+
7
  # Constants
8
  GITHUB_URL = "https://github.com/Sartify/STEL"
9
+ POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]
10
 
11
  def extract_table_from_markdown(markdown_text, table_start):
12
  """Extract table content from markdown text."""
 
16
  for line in lines:
17
  if line.startswith(table_start):
18
  capture = True
 
19
  if capture and line.strip() == '':
20
  break
21
  if capture:
 
24
 
25
  def markdown_table_to_df(table_content):
26
  """Convert markdown table to pandas DataFrame."""
27
+ # Split the table content into lines
28
+ lines = table_content.split('\n')
29
+
30
+ # Extract headers
31
+ headers = [h.strip() for h in lines[0].split('|') if h.strip()]
32
+
33
+ # Extract data
34
+ data = []
35
+ for line in lines[2:]: # Skip the header separator line
36
+ row = [cell.strip() for cell in line.split('|') if cell.strip()]
37
+ if row:
38
+ data.append(row)
39
+
40
+ # Create DataFrame
41
+ df = pd.DataFrame(data, columns=headers)
42
+
43
+ # Convert numeric columns to float
44
+ for col in df.columns:
45
+ if df[col].dtype == object:
46
+ try:
47
+ df[col] = df[col].astype(float)
48
+ except ValueError:
49
+ pass # Keep as string if conversion fails
50
+
51
  return df
52
 
53
  def setup_page():
54
  """Set up the Streamlit page."""
55
  st.set_page_config(page_title="Swahili Text Embeddings Leaderboard", page_icon="⚡", layout="wide")
56
  st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
57
+ st.image("https://raw.githubusercontent.com/username/repo/main/STEL.jpg", width=300)
58
 
59
  def display_leaderboard(df):
60
  """Display the leaderboard."""
 
71
  df_display = df[present_non_benchmark_cols + selected_columns]
72
 
73
  # Display dataframe
74
+ st.dataframe(df_display.style.format("{:.4f}", subset=selected_columns))
75
 
76
  # Download buttons
77
  csv = df_display.to_csv(index=False)
78
  st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
79
 
80
+ # ... (rest of the code remains the same)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  def main():
83
  setup_page()
 
99
  st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")
100
 
101
  if __name__ == "__main__":
102
+ main()