SamoXXX committed on
Commit 1aeeb67 · verified · 1 Parent(s): b740c26

Update visuals and description

Files changed (1)
  1. app.py +130 -89
app.py CHANGED
@@ -5,6 +5,7 @@ import seaborn as sns
  import plotly.graph_objects as go
  import plotly.express as px
  from st_social_media_links import SocialMediaIcons
+ from streamlit_javascript import st_javascript
 
 
  AVERAGE_COLUMN_NAME = "Average"
@@ -33,12 +34,43 @@ def style_dataframe(df: pd.DataFrame):
 
  def styler(df: pd.DataFrame):
      palette = sns.color_palette("RdYlGn", as_cmap=True)
-     styled_df = df.style.background_gradient(cmap=palette, subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME]).format(precision=2)
+     styled_df = df.style.set_table_styles(
+         [{
+             'selector': 'th',
+             'props': [
+                 ('background-color', '#4CAF50'),
+                 ('color', 'white'),
+                 ('font-family', 'Arial, sans-serif'),
+                 ('font-size', '16px')
+             ]
+         },
+         {
+             'selector': 'td, th',
+             'props': [
+                 ('border', '2px solid #4CAF50')
+             ]
+         }]
+     ).background_gradient(cmap=palette, subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME]
+     ).format(precision=2
+     )
      return styled_df
 
 
  ### Streamlit app
  st.set_page_config(layout="wide")
+ # Check if background color is white using JavaScript
+ st.markdown("""
+     <script>
+         function checkBackgroundColor() {
+             var backgroundColor = getComputedStyle(document.body).backgroundColor;
+             var isWhite = backgroundColor === 'rgb(255, 255, 255)' || backgroundColor === '#FFFFFF';
+             return isWhite;
+         }
+ 
+         var isWhiteBackground = checkBackgroundColor();
+         Streamlit.setComponentValue('background_color', isWhiteBackground);
+     </script>
+ """, unsafe_allow_html=True)
 
  st.markdown("""
      <style>
@@ -53,7 +85,6 @@ st.markdown("""
  """, unsafe_allow_html=True)
 
  ### Prepare layout
- st.subheader("")
 
  st.markdown("""
      <style>
@@ -66,73 +97,12 @@ st.markdown("""
      .center-text {
          text-align: center;
      }
-     .table-responsive {
-         text-align: center;
-         font-size: 0.9em;
-         margin-left: 0%;
-         margin-right: 0%;
-         overflow-x: auto;
-         -ms-overflow-style: 3px; /* Internet Explorer 10+ */
-         scrollbar-width: thin; /* Firefox */
-     }
-     .table-responsive::-webkit-scrollbar {
-         /*display: none;*/ /* Safari and Chrome */
-         width: 6px;
-     }
-
-     #table_id {
-         display: block;
-     }
-
-     #table_id th {
-         display: inline-block;
-     }
-
-     #table_id td {
-         padding-left: 0.7rem;
-         padding-right: 0.7rem;
-         display: inline-block;
-     }
-     #table_id td:hover {
-         color:#FDA428;
-     }
 
-     a:link {color:#A85E00;} /* unvisited link */
+     a:link {color:#FDA428;} /* unvisited link */
      a:hover {color:#FDA428;} /* Mouse over link */
-     a:visited {color:#A85E00;} /* visited link */
-     a:active {color:#A85E00;} /* selected link */
-
-     .image-container {
-         position: relative;
-         display: inline-block;
-         transition: transform 0.3s ease;
-     }
-
-     .image-container img {
-         vertical-align: middle;
-     }
-
-     .image-container::after {
-         content: "";
-         position: absolute;
-         left: 0;
-         bottom: 0;
-         width: 100%;
-         height: 2px;
-         background-color: #FDA428; /* Change this to your desired color */
-         transform: scaleX(0);
-         transition: transform 0.3s ease;
-     }
-
-     .image-container:hover {
-         transform: translateY(-3px); /* Change the value to adjust the upward movement */
-     }
-
-     .image-container:hover::after {
-         transform: scaleX(1);
-     }
-
-     /* ---------------------------------------------------------------- */
+     a:visited {color:#FDA428;} /* visited link */
+     a:active {color:#FDA428;} /* selected link */
 
  </style>
  """, unsafe_allow_html=True)
 
@@ -159,35 +129,47 @@ white_color = "#FFFFFF"
  black_color = "#000000"
  links_color = white_color
 
- # if theme.get('background_color') == white_color:
- #     links_color = black_color
+ return_value = False
+
+ return_value = st_javascript("""function darkMode(i){return (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches)}(1)""")
+ print("IS dark mode? return_value: ", return_value)
+
+ ### Retrieve the background color information in Streamlit
+ # if st.query_params.get('background_color') == 'true':
+ if not return_value:
+     links_color = black_color
+ else:
+     links_color = white_color
 
  social_media_links_colors = [
-     dark_orange,
-     dark_orange,
-     dark_orange,
-     dark_orange,
-     dark_orange
+     links_color,
+     links_color,
+     links_color,
+     links_color,
+     links_color
  ]
 
  social_media_icons = SocialMediaIcons(social_media_links, social_media_links_colors)
  social_media_icons.render(justify_content='right')
 
+ st.markdown("""
+     <br>
+     <img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg" alt="SpeakLeash Logo">
+ """, unsafe_allow_html=True)
+
  # Add logo, title, and subheader in a flexible container with equal spacing
  st.markdown("""
      <div class="header-container">
-         <br>
-         <img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg" alt="SpeakLeash Logo">
-         <hr>
+         <br><br>
          <div class="title-container">
-             <h1 style='color: #FDA428; margin-top: -1rem; font-size: 3.1em;'>Phrase-Bench</h1>
+             <h1 style='color: #FDA428; margin-top: -1rem; font-size: 3.1em;'>CPTUB - Complex Polish Text Understanding Benchmark</h1>
              <h3 style="margin-top: 0;">Understanding of Polish text, sentiment and phraseological compounds</h3>
          </div>
      </div>
  """, unsafe_allow_html=True)
 
  # Create tabs
- tab1, tab2 = st.tabs([RESULTS_COLUMN_NAME, "Opis"])
+ tab1, tab2 = st.tabs([RESULTS_COLUMN_NAME, "Description"])
 
  with tab1:
      st.write("This benchmark evaluates the ability of language models to correctly interpret Polish texts with complex implicatures, such as sarcasm and idiomatic expressions. Models are assessed on sentiment analysis, understanding of true intentions, and identification of idiomatic phrases.")
@@ -204,15 +186,13 @@ with tab1:
          "Params": st.column_config.NumberColumn("Params [B]", format="%.1f"),
          AVERAGE_COLUMN_NAME: st.column_config.NumberColumn(AVERAGE_COLUMN_NAME),
          RESULTS_COLUMN_NAME: st.column_config.BarChartColumn(
-             RESULTS_COLUMN_NAME, help="Summary of the results of each task",
+             "Bar chart of results", help="Summary of the results of each task",
              y_min=0, y_max=5,),
          SENTIMENT_COLUMN_NAME: st.column_config.NumberColumn(SENTIMENT_COLUMN_NAME, help='Ability to analyze sentiment'),
-         PHRASEOLOGY_COLUMN_NAME: st.column_config.NumberColumn(PHRASEOLOGY_COLUMN_NAME, help='Ability to understand phraseological compounds'),
          UNDERSTANDING_COLUMN_NAME: st.column_config.NumberColumn(UNDERSTANDING_COLUMN_NAME, help='Ability to understand language'),
+         PHRASEOLOGY_COLUMN_NAME: st.column_config.NumberColumn(PHRASEOLOGY_COLUMN_NAME, help='Ability to understand phraseological compounds'),
      }, hide_index=True, disabled=True, height=500)
 
-     # st.divider()
-
      # Add selection for models and create a bar chart for selected models using the AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME
      # Add default selection of 3 best models from AVERAGE_COLUMN_NAME and 1 best model with "Bielik" in Model column
      default_models = list(data.sort_values(AVERAGE_COLUMN_NAME, ascending=False)['Model'].head(3))
@@ -220,17 +200,17 @@ with tab1:
      if bielik_model not in default_models:
          default_models.append(bielik_model)
      selected_models = st.multiselect("Select models to compare", data["Model"].unique(), default=default_models)
-     # selected_models = st.multiselect("Select models to compare", data["Model"].unique())
      selected_data = data[data["Model"].isin(selected_models)]
      categories = [AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME]
 
      if selected_models:
          # Colors to choose from:
          # colors = px.colors.sample_colorscale("viridis", len(selected_models)+1)
-         colors = px.colors.qualitative.D3[:len(selected_models)]
-         # Create a chart with bars for each model for each category
+         colors = px.colors.qualitative.G10[:len(selected_models)]
          fig_bars = go.Figure()
          for model, color in zip(selected_models, colors):
+             # Set y-axis range from 0 to 5
+
              values = selected_data[selected_data['Model'] == model][categories].values.flatten().tolist()
              fig_bars.add_trace(go.Bar(
                  x=categories,
@@ -247,18 +227,79 @@ with tab1:
              yaxis_title="Score",
              template="plotly_dark"
          )
+         fig_bars.update_yaxes(range=[0, 5.1])
          st.plotly_chart(fig_bars)
 
 
  ### Tab 2 --> Description
  with tab2:
-     st.header("Opis")
-     st.write("Tutaj znajduje się trochę tekstu jako wypełniacz.")
-     st.write("To jest przykładowy tekst, który może zawierać dodatkowe informacje o benchmarku, metodologii, itp.")
+     st.markdown("""
+ ### <span style='text-decoration: #FDA428 wavy underline;'>**Cause of Creation**</span>
+ 1. **Need**: Models face significant challenges when dealing with complex, context-reliant texts that involve meanings implied beyond the literal content of a statement. Such cases include sarcasm, implicatures, and phraseological compounds.
+ 
+ Traditional sentiment classifiers typically rely on word-based features (e.g., identifying positive or negative words) to assess sentiment. However, with sarcasm, the literal meaning of words often contradicts the intended sentiment, making it difficult for models to accurately gauge tone. Sarcasm's context-dependence further complicates matters, as these classifiers typically lack the ability to grasp nuanced cues in context, especially when sarcasm is subtle.
+ Similarly, classifiers struggle with implicatures, where the underlying intent is implied rather than explicitly stated. Here, models fail to capture the full sentiment because they rely heavily on surface-level words, missing the non-literal meaning that often drives the sentiment.
+ Phraseological compounds add another layer of difficulty. These are fixed or semi-fixed expressions whose meanings cannot be directly inferred from the individual words. Language models, trained on word-level patterns, often misinterpret these expressions because they fail to recognize the idiomatic or non-literal meaning, leading to inaccurate sentiment analysis.
+ In addition to sentiment analysis, we decided to include the understanding of more complex texts in the benchmark, measured by the ability to uncover the intended meaning.
+ 
+ ### <span style='text-decoration: #FDA428 wavy underline;'>**Dataset Information**</span>
+ The dataset contains 200 examples, all written in Polish. Each example consists of the following:
+ - **Main Text**: A statement (often an opinion) on any topic that includes some type of implied meaning, often several simultaneously, such as sarcasm, phraseological compounds, or implicatures.
+ - **Reference Sentiment**: The sentiment associated with the main text. We use three categories: negative, neutral, and positive. Ambiguous examples were labeled as "neutral" to exclude them from sentiment classification testing.
+ - **Reference Phraseological Compounds**: A list of phraseological compounds found in the main text.
+ - **Reference Explanation**: An explanation of the underlying intentions that the author of the main text might have had.
+ 
+ ### <span style='text-decoration: #FDA428 wavy underline;'>**Evaluation Procedure**</span>
+ We distinguish between two models in the evaluation process:
+ - **Evaluated Model**: The model that performs specific tasks, is then assessed based on its performance, and is added to the ranking.
+ - **Judge Metamodel**: One of the currently strongest, most versatile LLMs, which scores the evaluated model's responses.
+ 
+ ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING RESPONSES FROM THE EVALUATED MODEL**</span>
+ 1. For each text in the dataset, the evaluated model was required to list the following in three points:
+    - The sentiment (only positive/negative).
+    - The underlying intentions of the author of the text.
+    - All phraseological compounds present in the text along with their meanings in the given context.
+ 2. No system prompt is used. The prompt provided to the evaluated model is written in Polish, as we are testing the models in this language. It contains:
+    - **User Prompt**: Three elements, each consisting of a header written in capital letters and content enclosed in triple quotes:
+      - Information about the role of a careful linguist with extensive experience.
+      - The instruction to perform the three previously described tasks.
+      - The first example of a text that could be included in the dataset.
+    - **Assistant Prompt**: A human-written example answer for the first example text.
+    - **User Prompt**: A second example of a text that could be included in the dataset.
+    - **Assistant Prompt**: A human-written example answer for the second example text.
+    - **User Prompt**: The target text, based on which the evaluated model will be assessed.
+ 3. The decision to split the examples into user prompts and assistant prompts was made because the vast majority of models achieved better results with this format. The two examples were selected for diversity: one has a negative sentiment and several phraseological compounds, while the other is positive and contains no phraseological compounds.
+ 
+ ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING METAMODEL EVALUATIONS**</span>
+ 1. The purpose of the metamodel is to return the following evaluations:
+    - **Understanding of the Text**: A comparison of the evaluated model's response description to the reference explanation.
+    - **Sentiment Analysis**: An optional evaluation, performed only if the reference sentiment is "positive" or "negative." We made this decision to exclude texts that people might interpret ambiguously.
+    - **Phraseological Compounds**: The model is penalized for phrases not included in the reference phraseological compounds. In cases where there are no phraseological compounds, the highest score is awarded only if the model indicates the absence of such expressions; one point is deducted for each excess phrase until the score reaches zero.
+ 2. Each evaluation is provided in JSON format. Example of a full response from the metamodel:
+ ```json
+ {"WYDŹWIĘK": "5"}
+ {"OCENA": "4"}
+ {"ZWIĄZKI": "3"}
+ ```
+ 3. The judge metamodel's prompt structure is similar to that of the evaluated model's prompt. No system prompt is used. The prompt includes:
+    - **User Prompt**: Three elements, each consisting of a header written in capital letters and content enclosed in triple quotes:
+      - **Role**: A reliable assistant who adheres to the instructions, does not perform any other tasks, and does not add any extra text to the response.
+      - **Task**: As described in point 1. The comparison of phraseological compounds has the most guidelines, so we noted that the model should focus on it as the most challenging step, and that its work will be evaluated based on this point.
+      - The first example of a potential response from the evaluated model along with the references.
+    - **Assistant Prompt**: An example response containing the evaluations.
+    - **User Prompt**: A second example of a potential response from the evaluated model along with the references.
+    - **Assistant Prompt**: An example response containing the evaluations for the second example.
+    - **User Prompt**: The actual response from the evaluated model and the references on which the metamodel will base the evaluations included in the benchmark.
+ 4. Here, too, the examples were selected for diversity: one includes a reference with a positive sentiment, while the other contains no reference sentiment at all (an example labeled as "neutral" in the dataset).
+ 5. It is worth explaining why we chose this particular process for evaluating phraseological compounds. Initially, we intended to check only the phrases included in the reference and ignore others in the evaluation. Unfortunately, this procedure favored models that provided many phrases that were not phraseological compounds.
+ Therefore, we decided to penalize models for phrases not included in the reference. We also aimed to ensure that models were not penalized for providing phraseological compounds we had not included in the reference: after generating the responses, we collected phrases noted by several models and manually reviewed all references to identify phraseological compounds we might have missed.
+ A similar procedure was applied to sentiment analysis: we listed all examples where several models consistently recorded a different sentiment than the reference and reconsidered whether those examples could be interpreted differently than initially assumed.
+     """, unsafe_allow_html=True)
 
 
  # Ending :)
- st.divider()
+ st.markdown("<hr style='border: 1px solid #A85E00;'>", unsafe_allow_html=True)
+ # st.divider()
  st.markdown("""
  ### Authors:
  - [Jan Sowa](https://www.linkedin.com/in/janpiotrsowa) - leadership, writing texts, benchmark code
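
The few-shot prompt layout described in the new tab-2 text (two worked examples as alternating user/assistant turns, followed by the target text, with no system prompt) can be sketched as follows. This is a minimal illustration only; the `build_prompt` helper and the placeholder strings are hypothetical, not the benchmark's actual prompts.

```python
# Hypothetical sketch of the few-shot chat layout described above:
# two worked examples as user/assistant turns, then the target text.
# The Polish header "TEKST" and the example strings are placeholders.

def build_prompt(task_instruction: str,
                 example_1: str, answer_1: str,
                 example_2: str, answer_2: str,
                 target_text: str) -> list[dict]:
    """Return a chat-style message list; no system prompt is used."""
    return [
        {"role": "user", "content": f"{task_instruction}\n\nTEKST: '''{example_1}'''"},
        {"role": "assistant", "content": answer_1},   # human-written answer for example 1
        {"role": "user", "content": f"TEKST: '''{example_2}'''"},
        {"role": "assistant", "content": answer_2},   # human-written answer for example 2
        {"role": "user", "content": f"TEKST: '''{target_text}'''"},  # text actually scored
    ]
```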
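The metamodel's one-JSON-object-per-line output shown in the description can be collected with a small parser along these lines. The three keys come from the example response above; the function name and the skip-on-parse-error behavior are assumptions.

```python
import json

# Keys as in the example metamodel response above:
# WYDŹWIĘK = sentiment, OCENA = understanding, ZWIĄZKI = phraseological compounds.
SCORE_KEYS = {"WYDŹWIĘK": "sentiment", "OCENA": "understanding", "ZWIĄZKI": "phraseology"}

def parse_judge_response(raw: str) -> dict[str, int]:
    """Parse one-JSON-object-per-line judge output into integer scores."""
    scores: dict[str, int] = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip any stray non-JSON text the judge may emit
        for key, name in SCORE_KEYS.items():
            if key in obj:
                scores[name] = int(obj[key])  # values arrive as strings, e.g. "5"
    return scores

# Example using the response shown in the description:
print(parse_judge_response('{"WYDŹWIĘK": "5"}\n{"OCENA": "4"}\n{"ZWIĄZKI": "3"}'))
# -> {'sentiment': 5, 'understanding': 4, 'phraseology': 3}
```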
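The penalty rule for phraseological compounds (one point deducted per excess phrase, floored at zero, with full marks only when a model correctly reports that no compounds are present) corresponds to arithmetic like the sketch below. The 5-point maximum is assumed from the example scores, and in the benchmark itself this comparison is performed by the judge metamodel rather than by exact string matching, so treat this as an illustration of the scoring rule only.

```python
MAX_SCORE = 5  # assumed maximum, matching the 0-5 scale in the example scores

def phraseology_score(predicted: list[str], reference: list[str]) -> int:
    """Deduct one point per predicted phrase missing from the reference, floored at zero.

    If the reference lists no compounds, the full score is awarded only when
    the model also reports none.
    """
    excess = [p for p in predicted if p not in reference]
    if not reference:
        return MAX_SCORE if not predicted else max(MAX_SCORE - len(predicted), 0)
    return max(MAX_SCORE - len(excess), 0)

# One phrase outside the reference costs one point (phrases here are invented):
assert phraseology_score(["rzucać grochem o ścianę", "wymyślona fraza"],
                         ["rzucać grochem o ścianę"]) == 4
```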