polinaeterna HF staff committed on
Commit
612a826
·
1 Parent(s): a3b4e99

remove unused code

Browse files
Files changed (1) hide show
  1. app.py +1 -47
app.py CHANGED
@@ -74,7 +74,7 @@ def plot_and_df(texts, preds):
74
  )
75
  # counts.reset_index(inplace=True)
76
  return (
77
- gr.BarPlot(counts_df, x="quality", y="count", sort=None),
78
  texts_df[texts_df["quality"] == "Low"][["text"]][:min(texts_df.shape[0], 20)],
79
  texts_df[texts_df["quality"] == "Medium"][["text"]][:min(texts_df.shape[0], 20)],
80
  texts_df[texts_df["quality"] == "High"][["text"]][:min(texts_df.shape[0], 20)],
@@ -117,15 +117,6 @@ def run_quality_check(dataset, config, split, column, batch_size, num_examples):
117
  texts_processed.extend(batch_texts)
118
  yield {"check in progress...": i / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
119
 
120
- # with multiprocessing.Pool(processes=8) as pool:
121
- # props = pool.map(proportion_non_ascii, texts)
122
- #
123
- # # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
124
- # plt.hist(props, bins=20, range=(0., 1.))
125
- # plt.title('Histogram of proportion of non-ASCII characters')
126
- # plt.xlabel('Proportion of non-ASCII characters')
127
- # plt.ylabel('Number of texts')
128
-
129
  yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), data_sample
130
 
131
 
@@ -215,35 +206,6 @@ def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_
215
  yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
216
 
217
 
218
- def proportion_non_ascii(s):
219
- """
220
- Compute the proportion of non-ASCII characters in a string.
221
-
222
- Parameters:
223
- s (str): The input string.
224
-
225
- Returns:
226
- float: The proportion of non-ASCII characters in the string.
227
- """
228
- non_ascii_count = sum(1 for c in s if ord(c) > 127)
229
- total_chars = len(s)
230
- return non_ascii_count / total_chars if total_chars > 0 else 0.0
231
-
232
-
233
- def non_ascii_check(texts_df, column_name):
234
- texts = texts_df[column_name].to_list()
235
- with multiprocessing.Pool(processes=8) as pool:
236
- props = pool.map(proportion_non_ascii, texts)
237
-
238
- # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
239
- plt.hist(props, bins=20, range=(0., 1.))
240
- plt.title('Histogram of proportion of non-ASCII characters')
241
- plt.xlabel('Proportion of non-ASCII characters')
242
- plt.ylabel('Number of texts')
243
-
244
- return plt.gcf()
245
-
246
-
247
  with gr.Blocks() as demo:
248
  gr.Markdown(
249
  """
@@ -355,14 +317,6 @@ with gr.Blocks() as demo:
355
  outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
356
  )
357
 
358
- # gr.Markdown("""## Compute text quality measures
359
- # * proportion of non-ascii characters
360
- # * #TODO""")
361
- # gr_ascii_btn = gr.Button("Data measures")
362
- # non_ascii_hist = gr.Plot()
363
- #
364
- # gr_ascii_btn.click(non_ascii_check, inputs=[texts_df, text_column], outputs=[non_ascii_hist])
365
-
366
  gr.Markdown("## Explore toxicity")
367
  # checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
368
  gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")
 
74
  )
75
  # counts.reset_index(inplace=True)
76
  return (
77
+ gr.BarPlot(counts_df, x="quality", y="count", sort=None, color="blue"),
78
  texts_df[texts_df["quality"] == "Low"][["text"]][:min(texts_df.shape[0], 20)],
79
  texts_df[texts_df["quality"] == "Medium"][["text"]][:min(texts_df.shape[0], 20)],
80
  texts_df[texts_df["quality"] == "High"][["text"]][:min(texts_df.shape[0], 20)],
 
117
  texts_processed.extend(batch_texts)
118
  yield {"check in progress...": i / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
119
 
 
 
 
 
 
 
 
 
 
120
  yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), data_sample
121
 
122
 
 
206
  yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
207
 
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  with gr.Blocks() as demo:
210
  gr.Markdown(
211
  """
 
317
  outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
318
  )
319
 
 
 
 
 
 
 
 
 
320
  gr.Markdown("## Explore toxicity")
321
  # checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
322
  gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")