Spaces:

polinaeterna
/

text_quality_checker

Running on Zero

App Files Files Community

polinaeterna HF staff committed on Sep 23, 2024

Commit

612a826

1 Parent(s): a3b4e99

remove unused code

Browse files

Files changed (1) hide show

app.py +1 -47

app.py CHANGED Viewed

@@ -74,7 +74,7 @@ def plot_and_df(texts, preds):
     )
     # counts.reset_index(inplace=True)
     return (
-            gr.BarPlot(counts_df, x="quality", y="count", sort=None),
             texts_df[texts_df["quality"] == "Low"][["text"]][:min(texts_df.shape[0], 20)],
             texts_df[texts_df["quality"] == "Medium"][["text"]][:min(texts_df.shape[0], 20)],
             texts_df[texts_df["quality"] == "High"][["text"]][:min(texts_df.shape[0], 20)],
@@ -117,15 +117,6 @@ def run_quality_check(dataset, config, split, column, batch_size, num_examples):
         texts_processed.extend(batch_texts)
         yield {"check in progress...": i / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
-    # with multiprocessing.Pool(processes=8) as pool:
-    #     props = pool.map(proportion_non_ascii, texts)
-    #
-    # # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
-    # plt.hist(props, bins=20, range=(0., 1.))
-    # plt.title('Histogram of proportion of non-ASCII characters')
-    # plt.xlabel('Proportion of non-ASCII characters')
-    # plt.ylabel('Number of texts')
     yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), data_sample
@@ -215,35 +206,6 @@ def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_
     yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
-def proportion_non_ascii(s):
-    """
-    Compute the proportion of non-ASCII characters in a string.
-    Parameters:
-    s (str): The input string.
-    Returns:
-    float: The proportion of non-ASCII characters in the string.
-    """
-    non_ascii_count = sum(1 for c in s if ord(c) > 127)
-    total_chars = len(s)
-    return non_ascii_count / total_chars if total_chars > 0 else 0.0
-def non_ascii_check(texts_df, column_name):
-    texts = texts_df[column_name].to_list()
-    with multiprocessing.Pool(processes=8) as pool:
-        props = pool.map(proportion_non_ascii, texts)
-    # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
-    plt.hist(props, bins=20, range=(0., 1.))
-    plt.title('Histogram of proportion of non-ASCII characters')
-    plt.xlabel('Proportion of non-ASCII characters')
-    plt.ylabel('Number of texts')
-    return plt.gcf()
 with gr.Blocks() as demo:
     gr.Markdown(
         """
@@ -355,14 +317,6 @@ with gr.Blocks() as demo:
         outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
     )
-    # gr.Markdown("""## Compute text quality measures
-    #             * proportion of non-ascii characters
-    #             * #TODO""")
-    # gr_ascii_btn = gr.Button("Data measures")
-    # non_ascii_hist = gr.Plot()
-    #
-    # gr_ascii_btn.click(non_ascii_check, inputs=[texts_df, text_column], outputs=[non_ascii_hist])
     gr.Markdown("## Explore toxicity")
     # checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
     gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")

     )
     # counts.reset_index(inplace=True)
     return (
+            gr.BarPlot(counts_df, x="quality", y="count", sort=None, color="blue"),
             texts_df[texts_df["quality"] == "Low"][["text"]][:min(texts_df.shape[0], 20)],
             texts_df[texts_df["quality"] == "Medium"][["text"]][:min(texts_df.shape[0], 20)],
             texts_df[texts_df["quality"] == "High"][["text"]][:min(texts_df.shape[0], 20)],
         texts_processed.extend(batch_texts)
         yield {"check in progress...": i / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
     yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), data_sample
     yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
 with gr.Blocks() as demo:
     gr.Markdown(
         """
         outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
     )
     gr.Markdown("## Explore toxicity")
     # checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
     gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")