Spaces:
Running on Zero
Commit
·
612a826
1
Parent(s):
a3b4e99
remove unused code
Browse files
app.py
CHANGED
@@ -74,7 +74,7 @@ def plot_and_df(texts, preds):
|
|
74 |
)
|
75 |
# counts.reset_index(inplace=True)
|
76 |
return (
|
77 |
-
gr.BarPlot(counts_df, x="quality", y="count", sort=None),
|
78 |
texts_df[texts_df["quality"] == "Low"][["text"]][:min(texts_df.shape[0], 20)],
|
79 |
texts_df[texts_df["quality"] == "Medium"][["text"]][:min(texts_df.shape[0], 20)],
|
80 |
texts_df[texts_df["quality"] == "High"][["text"]][:min(texts_df.shape[0], 20)],
|
@@ -117,15 +117,6 @@ def run_quality_check(dataset, config, split, column, batch_size, num_examples):
|
|
117 |
texts_processed.extend(batch_texts)
|
118 |
yield {"check in progress...": i / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
|
119 |
|
120 |
-
# with multiprocessing.Pool(processes=8) as pool:
|
121 |
-
# props = pool.map(proportion_non_ascii, texts)
|
122 |
-
#
|
123 |
-
# # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
|
124 |
-
# plt.hist(props, bins=20, range=(0., 1.))
|
125 |
-
# plt.title('Histogram of proportion of non-ASCII characters')
|
126 |
-
# plt.xlabel('Proportion of non-ASCII characters')
|
127 |
-
# plt.ylabel('Number of texts')
|
128 |
-
|
129 |
yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), data_sample
|
130 |
|
131 |
|
@@ -215,35 +206,6 @@ def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_
|
|
215 |
yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
|
216 |
|
217 |
|
218 |
-
def proportion_non_ascii(s):
|
219 |
-
"""
|
220 |
-
Compute the proportion of non-ASCII characters in a string.
|
221 |
-
|
222 |
-
Parameters:
|
223 |
-
s (str): The input string.
|
224 |
-
|
225 |
-
Returns:
|
226 |
-
float: The proportion of non-ASCII characters in the string.
|
227 |
-
"""
|
228 |
-
non_ascii_count = sum(1 for c in s if ord(c) > 127)
|
229 |
-
total_chars = len(s)
|
230 |
-
return non_ascii_count / total_chars if total_chars > 0 else 0.0
|
231 |
-
|
232 |
-
|
233 |
-
def non_ascii_check(texts_df, column_name):
|
234 |
-
texts = texts_df[column_name].to_list()
|
235 |
-
with multiprocessing.Pool(processes=8) as pool:
|
236 |
-
props = pool.map(proportion_non_ascii, texts)
|
237 |
-
|
238 |
-
# non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
|
239 |
-
plt.hist(props, bins=20, range=(0., 1.))
|
240 |
-
plt.title('Histogram of proportion of non-ASCII characters')
|
241 |
-
plt.xlabel('Proportion of non-ASCII characters')
|
242 |
-
plt.ylabel('Number of texts')
|
243 |
-
|
244 |
-
return plt.gcf()
|
245 |
-
|
246 |
-
|
247 |
with gr.Blocks() as demo:
|
248 |
gr.Markdown(
|
249 |
"""
|
@@ -355,14 +317,6 @@ with gr.Blocks() as demo:
|
|
355 |
outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
|
356 |
)
|
357 |
|
358 |
-
# gr.Markdown("""## Compute text quality measures
|
359 |
-
# * proportion of non-ascii characters
|
360 |
-
# * #TODO""")
|
361 |
-
# gr_ascii_btn = gr.Button("Data measures")
|
362 |
-
# non_ascii_hist = gr.Plot()
|
363 |
-
#
|
364 |
-
# gr_ascii_btn.click(non_ascii_check, inputs=[texts_df, text_column], outputs=[non_ascii_hist])
|
365 |
-
|
366 |
gr.Markdown("## Explore toxicity")
|
367 |
# checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
|
368 |
gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")
|
|
|
74 |
)
|
75 |
# counts.reset_index(inplace=True)
|
76 |
return (
|
77 |
+
gr.BarPlot(counts_df, x="quality", y="count", sort=None, color="blue"),
|
78 |
texts_df[texts_df["quality"] == "Low"][["text"]][:min(texts_df.shape[0], 20)],
|
79 |
texts_df[texts_df["quality"] == "Medium"][["text"]][:min(texts_df.shape[0], 20)],
|
80 |
texts_df[texts_df["quality"] == "High"][["text"]][:min(texts_df.shape[0], 20)],
|
|
|
117 |
texts_processed.extend(batch_texts)
|
118 |
yield {"check in progress...": i / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
|
119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), data_sample
|
121 |
|
122 |
|
|
|
206 |
yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
|
207 |
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
with gr.Blocks() as demo:
|
210 |
gr.Markdown(
|
211 |
"""
|
|
|
317 |
outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
|
318 |
)
|
319 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
gr.Markdown("## Explore toxicity")
|
321 |
# checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
|
322 |
gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")
|