lhoestq HF staff committed on
Commit
0b712fa
·
1 Parent(s): 91893b1

workaround dataframe bug

Browse files
Files changed (1) hide show
  1. app.py +241 -203
app.py CHANGED
@@ -30,6 +30,52 @@ nltk.download('punkt_tab')
30
  DUMP_TO_PROCESS = "CC-MAIN-2023-50"
31
  TIMEOUT = 600
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  make_gallery_image_buttons_js = """
34
  function load() {
35
  class ClassWatcher {
@@ -97,6 +143,13 @@ tr td {
97
  .grid-wrap {
98
  min-height: 0;
99
  }
 
 
 
 
 
 
 
100
  """
101
 
102
 
@@ -109,216 +162,201 @@ def prepare_as_list_or_none(text: str) -> Optional[list[str]]:
109
  def non_empty_list_or_none(input_list: list[str]) -> Optional[list[str]]:
110
  return input_list or None
111
 
112
- def build_code_snippet(steps, params=None):
113
- # TODO
114
- return (
115
- "```python\n"
116
- "TODO\n"
117
- "```"
118
- )
119
-
120
 
121
  with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
122
  state = gr.State({"selected_block": 0})
123
  gr.Markdown("# Common Crawl Pipeline Creator")
124
- gallery = gr.Gallery(
125
- blocks,
126
- columns=4,
127
- rows=2,
128
- label="Select step to edit",
129
- object_fit="scale-down",
130
- show_share_button=False,
131
- show_download_button=False,
132
- show_fullscreen_button=False,
133
- elem_id="pipeline-gallery",
134
- allow_preview=False,
135
- )
136
- gallery_image_buttons = [gr.Button(visible=False, elem_classes="block-button") for _ in blocks] # hack to simulate each image gallery as a button, see `make_gallery_image_buttons_js`
137
- blocks_uis = []
138
- with gr.Column(visible=False) as col:
139
- blocks_uis.append(col)
140
- gr.Markdown("## 1. URL Filtering \n\nPerforms filtering based on samples urls.")
141
- with gr.Group():
142
- url_filtering_checkbox = gr.Checkbox(True, label="Enable")
143
- with gr.Accordion("Parameters", open=True) as acc:
144
- use_integrated_lists_checkbox = gr.Checkbox(True, label="use_integrated_lists", info="use the datatrove integrated lists of banned urls and words")
145
- with gr.Row():
146
- with gr.Column():
147
- extra_domain_textbox = gr.Textbox("", label="extra_domains", info="remove if the domain is present in `extra_domains`")
148
- extra_domain_textbox.prepare_parameter = prepare_as_list_or_none
149
- extra_urls_textbox = gr.Textbox("", label="extra_urls", info="remove if the full url is present on `extra_urls`")
150
- extra_urls_textbox.prepare_parameter = prepare_as_list_or_none
151
- with gr.Column():
152
- banned_words_textbox = gr.Textbox("", label="banned_words", info="remove if any word from `banned_words` is in the url")
153
- banned_words_textbox.prepare_parameter = prepare_as_list_or_none
154
- banned_subwords_textbox = gr.Textbox("", label="banned_subwords", info="remove if any word from `banned_subwords` is a substring of the url")
155
- banned_subwords_textbox.prepare_parameter = prepare_as_list_or_none
156
- with gr.Column():
157
- soft_banned_words_textbox = gr.Textbox("", label="soft_banned_words", info="remove if there are at least `soft_word_threshold` words from `soft_banned_words` in the url")
158
- soft_banned_words_textbox.prepare_parameter = prepare_as_list_or_none
159
- soft_word_threshold_slider = gr.Slider(0, 5, value=2, step=1, label="soft_word_threshold", info="remove if there are at least `soft_word_threshold` words from `soft_banned_words` in the url")
160
- url_filtering_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=url_filtering_checkbox, outputs=acc)
161
- url_filtering_parameters_components = [use_integrated_lists_checkbox, extra_domain_textbox, extra_urls_textbox, banned_words_textbox, banned_subwords_textbox, soft_banned_words_textbox, soft_word_threshold_slider]
162
- with gr.Column(visible=False) as col:
163
- blocks_uis.append(col)
164
- gr.Markdown("## 2. Text Extraction \n\nUses the [Trafilatura](https://trafilatura.readthedocs.io) extractor.")
165
- with gr.Group():
166
- text_extraction_checkbox = gr.Checkbox(True, label="Enable")
167
- with gr.Accordion("Parameters", open=True) as acc:
168
- with gr.Row():
169
- favour_precision_checkbox = gr.Checkbox(True, label="favour_precision", info="prefer less text but correct extraction")
170
- timeout_slider = gr.Slider(0.05, 0.5, value=0.1, step=0.05, label="timeout", info="the timeout for extraction, per document, in seconds")
171
- deduplicate_checkbox = gr.Checkbox(True, label="deduplicate", info="trafilatura's deduplicate option")
172
- text_extraction_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=text_extraction_checkbox, outputs=acc)
173
- text_extraction_parameters_components = [favour_precision_checkbox, timeout_slider, deduplicate_checkbox]
174
- with gr.Column(visible=False) as col:
175
- blocks_uis.append(col)
176
- gr.Markdown("## 3. Language Filtering \n\nUses the [fastext](https://fasttext.cc/docs/en/language-identification.html) language identification models.")
177
- with gr.Group():
178
- language_filtering_checkbox = gr.Checkbox(True, label="Enable")
179
- with gr.Accordion("Parameters", open=True) as acc:
180
- with gr.Row():
181
- languages_textbox = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), multiselect=True, label="languages", info="list of languages to keep. empty for all")
182
- languages_textbox.prepare_parameter = non_empty_list_or_none
183
- language_threshold_slider = gr.Slider(0, 1, value=0.65, step=0.05, label="language_threshold", info="minimum score to accept a document")
184
- language_filtering_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=language_filtering_checkbox, outputs=acc)
185
- language_filtering_parameters_components = [languages_textbox, language_threshold_slider]
186
- with gr.Column(visible=False) as col:
187
- blocks_uis.append(col)
188
- gr.Markdown("## 4. Gopher Filtering (repetitions) \n\nUses the [Gopher](https://huggingface.co/papers/2112.11446) text repetition filters.")
189
- with gr.Group():
190
- gopher_filtering_repetitions_checkbox = gr.Checkbox(True, label="Enable")
191
- with gr.Accordion("Parameters", open=True) as acc:
192
  with gr.Group():
193
- with gr.Row():
194
- language_dropdown1 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
195
- top_n_grams_textbox = gr.Textbox("(2, 0.2), (3, 0.18), (4, 0.16)", label="top_n_grams")
196
- top_n_grams_textbox.prepare_parameter = ast.literal_eval
197
- dup_n_grams_textbox = gr.Textbox("(5, 0.15), (6, 0.14), (7, 0.13), (8, 0.12), (9, 0.11), (10, 0.10)", label="dup_n_grams")
198
- dup_n_grams_textbox.prepare_parameter = ast.literal_eval
199
- with gr.Row():
200
- dup_line_frac_slider = gr.Slider(0, 1, value=0.3, step=0.05, label="dup_line_frac")
201
- dup_para_frac_slider = gr.Slider(0, 1, value=0.3, step=0.05, label="dup_para_frac")
202
- dup_line_char_frac_slider = gr.Slider(0, 1, value=0.2, step=0.05, label="dup_line_char_frac")
203
- dup_para_char_frac_slider = gr.Slider(0, 1, value=0.2, step=0.05, label="dup_para_char_frac")
204
- gopher_filtering_repetitions_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_repetitions_checkbox, outputs=acc)
205
- gopher_filtering_repetitions_parameters_components = [language_dropdown1, top_n_grams_textbox, dup_n_grams_textbox, dup_line_frac_slider, dup_para_frac_slider, dup_line_char_frac_slider, dup_para_char_frac_slider]
206
- with gr.Column(visible=False) as col:
207
- blocks_uis.append(col)
208
- gr.Markdown("## 8. PII Removal \n\nReplaces email addresses and ip addresses in the document text.")
209
- with gr.Group():
210
- pii_removal_checkbox = gr.Checkbox(True, label="Enable")
211
- with gr.Accordion("Parameters", open=True) as acc:
212
- with gr.Row():
213
- remove_emails_checkbox = gr.Checkbox(True, label="remove_emails", info="Replace email addresses")
214
- remove_ips_checkbox = gr.Checkbox(True, label="remove_ips", info="Replace IP addresses")
215
- only_remove_public_ips_checkbox = gr.Checkbox(True, label="only_remove_public_ips", info="by default we only replace public (and thus PII) IPs")
216
- with gr.Row():
217
- email_replacement_textbox = gr.Textbox("email@example.com, firstname.lastname@example.org", label="email_replacement", info="strings to use as replacement. They will be used in a circular way")
218
- email_replacement_textbox.prepare_parameter = prepare_as_list_or_none
219
- ip_replacement_textbox = gr.Textbox("22.214.171.124, 126.96.36.199, 188.8.131.52, 184.108.40.206, 220.127.116.11, 18.104.22.168", label="ip_replacement", info="same as email_replacement but for IP addresses")
220
- ip_replacement_textbox.prepare_parameter = prepare_as_list_or_none
221
- pii_removal_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=pii_removal_checkbox, outputs=acc)
222
- pii_removal_parameters_components = [remove_emails_checkbox, remove_ips_checkbox, only_remove_public_ips_checkbox, email_replacement_textbox, ip_replacement_textbox]
223
- with gr.Column(visible=False) as col:
224
- blocks_uis.append(col)
225
- gr.Markdown("## 7. Custom Filters \n\nUses the [FineWeb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) custom text filters.")
226
- with gr.Group():
227
- custom_filters_checkbox = gr.Checkbox(True, label="Enable")
228
- with gr.Accordion("Parameters", open=True) as acc:
229
- with gr.Row():
230
- line_punct_thr_slider = gr.Slider(0, 1, value=0.12, step=0.01, label="line_punct_thr")
231
- line_punct_exclude_zero = gr.Checkbox(False, label="line_punct_exclude_zero")
232
- short_line_thr_slider = gr.Slider(0, 1, value=0.67, step=0.01, label="short_line_thr")
233
- short_line_length_slider = gr.Slider(0, 100, value=30, step=1, label="short_line_length")
234
- char_duplicates_ratio_slider = gr.Slider(0, 1, value=0.01, step=0.01, label="char_duplicates_ratio")
235
- new_line_ratio_slider = gr.Slider(0, 1, value=0.3, step=0.01, label="new_line_ratio")
236
- custom_filters_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=custom_filters_checkbox, outputs=acc)
237
- custom_filters_parameters_components = [line_punct_thr_slider, line_punct_exclude_zero, short_line_thr_slider, short_line_length_slider, char_duplicates_ratio_slider, new_line_ratio_slider]
238
- with gr.Column(visible=False) as col:
239
- blocks_uis.append(col)
240
- gr.Markdown("## 6. C4 Filters\n\nUses the [C4](https://huggingface.co/datasets/allenai/c4) text size and content filters.")
241
- with gr.Group():
242
- c4_filters_checkbox = gr.Checkbox(True, label="Enable")
243
- with gr.Accordion(" Parameters", open=True) as acc:
244
  with gr.Group():
245
- with gr.Row():
246
- split_paragraph_checkbox = gr.Checkbox(True, label="split_paragraph", info="disable to apply the filters to each sentence instead of to each line")
247
- with gr.Row():
248
- language_dropdown2 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
249
- min_num_sentences_slider = gr.Slider(0, 10, value=5, step=1, label="min_num_sentences", info="remove documents that do not have at least this number of sentences (after line filtering)")
250
- min_words_per_line_slider = gr.Slider(0, 10, value=3, step=1, label="min_words_per_line", info="drop lines without this min number of words")
251
- max_word_length_slider = gr.Slider(0, 2000, value=1000, step=10, label="max_word_length", info=" drop lines where at least one word has more than this number of characters")
252
- with gr.Row():
253
- remove_citations_checkbox = gr.Checkbox(True, label="remove_citations", info="remove wikipedia style citations from the text")
254
- filter_no_terminal_punct_checkbox = gr.Checkbox(True, label="filter_no_terminal_punct", info="remove lines without terminal punctuation marks")
255
- filter_lorem_ipsum_checkbox = gr.Checkbox(True, label="filter_lorem_ipsum", info="drop documents that contain 'lorem ipsum'")
256
- filter_javascript_checkbox = gr.Checkbox(True, label="filter_javascript", info="drop lines mentioning 'javascript'")
257
- filter_curly_bracket = gr.Checkbox(True, label="filter_curly_bracket", info="drop documents containing {")
258
- filter_policy = gr.Checkbox(True, label="filter_policy", info="drop lines containing any of the policy phrases (e.g. 'terms of use', 'use cookies')")
259
- c4_filters_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=c4_filters_checkbox, outputs=acc)
260
- c4_filters_parameters_components = [split_paragraph_checkbox, language_dropdown2, min_num_sentences_slider, min_words_per_line_slider, max_word_length_slider, remove_citations_checkbox, filter_no_terminal_punct_checkbox, filter_lorem_ipsum_checkbox, filter_javascript_checkbox, filter_curly_bracket, filter_policy]
261
- with gr.Column(visible=False) as col:
262
- blocks_uis.append(col)
263
- gr.Markdown("## 5. Gopher Filtering (quality) \n\nUses the [Gopher](https://huggingface.co/papers/2112.11446) text quality filters.")
264
- with gr.Group():
265
- gopher_filtering_quality_checkbox = gr.Checkbox(True, label="Enable")
266
- with gr.Accordion("Parameters", open=True) as acc:
267
  with gr.Group():
268
- with gr.Row():
269
- language_dropdown2 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
270
- min_doc_words_slider = gr.Slider(0, 1000, value=50, step=10, label="min_doc_words")
271
- max_doc_words_slider = gr.Slider(0, 200_000, value=100_000, step=10_000, label="max_doc_words")
272
- with gr.Row():
273
- min_avg_word_length_slider = gr.Slider(0, 20, value=3, step=1, label="min_avg_word_length")
274
- max_avg_word_length_slider = gr.Slider(0, 20, value=10, step=1, label="max_avg_word_length")
275
- with gr.Row():
276
- max_symbol_word_ratio_slider = gr.Slider(0, 1, value=0.1, step=0.05, label="max_symbol_word_ratio")
277
- max_bullet_lines_ratio_slider = gr.Slider(0, 1, value=0.9, step=0.05, label="max_bullet_lines_ratio")
278
- max_ellipsis_lines_ratio_slider = gr.Slider(0, 1, value=0.3, step=0.05, label="max_ellipsis_lines_ratio")
279
- max_non_alpha_words_ratio_slider = gr.Slider(0, 1, value=0.8, step=0.05, label="max_non_alpha_words_ratio")
280
- with gr.Row():
281
- min_stop_words_slider = gr.Slider(0, 10, value=2, step=1, label="min_stop_words")
282
- stop_words_textbox = gr.Textbox("the, be, to, of, and, that, have, with", label="stop_words")
283
- stop_words_textbox.prepare_parameter = prepare_as_list_or_none
284
- gopher_filtering_quality_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_quality_checkbox, outputs=acc)
285
- gopher_filtering_quality_parameters_components = [language_dropdown2, min_doc_words_slider, max_doc_words_slider, min_avg_word_length_slider, max_avg_word_length_slider, max_symbol_word_ratio_slider, max_bullet_lines_ratio_slider, max_ellipsis_lines_ratio_slider, max_non_alpha_words_ratio_slider, min_stop_words_slider, stop_words_textbox]
286
-
287
- view_pipeline_results_button = gr.Button("Run Pipeline & Stream Results", variant="primary", scale=4)
288
-
289
- steps = [
290
- URLFilter,
291
- Trafilatura,
292
- LanguageFilter,
293
- GopherRepetitionFilter,
294
- GopherQualityFilter,
295
- C4QualityFilter,
296
- FineWebQualityFilter,
297
- PIIFormatter
298
- ]
299
- steps_parameters_components = [
300
- url_filtering_parameters_components,
301
- text_extraction_parameters_components,
302
- language_filtering_parameters_components,
303
- gopher_filtering_repetitions_parameters_components,
304
- gopher_filtering_quality_parameters_components,
305
- c4_filters_parameters_components,
306
- custom_filters_parameters_components,
307
- pii_removal_parameters_components
308
- ]
309
-
310
- with gr.Tab("Output") as output_tab:
311
- output_dataframe = gr.DataFrame(datatype="markdown")
312
- with gr.Tab("Excluded") as excluded_tab:
313
- excluded_dataframes: dict[Type, gr.DataFrame] = {}
314
- excluded_tabs: dict[Type, gr.Tab] = {}
315
- for step in steps:
316
- if issubclass(step, BaseFilter) and step is not URLFilter:
317
- with gr.Tab(step.__name__) as t:
318
- excluded_dataframes[step] = gr.DataFrame(datatype="markdown")
319
- excluded_tabs[step] = t
320
- with gr.Tab("Python code") as code_tab:
321
- python_code_markdown = gr.Markdown(build_code_snippet(steps))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
 
324
  gr.Markdown("_powered by [datatrove](https://github.com/huggingface/datatrove)_")
 
30
  DUMP_TO_PROCESS = "CC-MAIN-2023-50"
31
  TIMEOUT = 600
32
 
33
+
34
+ steps = [
35
+ URLFilter,
36
+ Trafilatura,
37
+ LanguageFilter,
38
+ GopherRepetitionFilter,
39
+ GopherQualityFilter,
40
+ C4QualityFilter,
41
+ FineWebQualityFilter,
42
+ PIIFormatter
43
+ ]
44
+
45
+ DEFAULT_CODE = dedent(
46
+ """
47
+ ```python
48
+ from datatrove.executor.local import LocalPipelineExecutor
49
+ from datatrove.pipeline.extractors import Trafilatura
50
+ from datatrove.pipeline.filters import (
51
+ C4QualityFilter,
52
+ FineWebQualityFilter,
53
+ GopherQualityFilter,
54
+ GopherRepetitionFilter,
55
+ LanguageFilter,
56
+ URLFilter,
57
+ )
58
+ from datatrove.pipeline.formatters import PIIFormatter
59
+ from datatrove.pipeline.readers import WarcReader
60
+ """
61
+ ).strip() + (
62
+ "\n\n"
63
+ "pipeline_executor = LocalPipelineExecutor(\n"
64
+ " pipeline=[\n"
65
+ f' WarcReader("s3://commoncrawl/crawl-data/{DUMP_TO_PROCESS}/segments", glob_pattern="*/warc/*"),\n'
66
+ ) + ",\n".join([
67
+ " " + step.__name__ + "()" for step in steps
68
+ ]) + (
69
+ "\n"
70
+ " ]\n"
71
+ ")"
72
+ ) + dedent(
73
+ """
74
+ pipeline_executor.run()
75
+ ```
76
+ """
77
+ )
78
+
79
  make_gallery_image_buttons_js = """
80
  function load() {
81
  class ClassWatcher {
 
143
  .grid-wrap {
144
  min-height: 0;
145
  }
146
+ .table-wrap {
147
+ min-height: 600px;
148
+ max-height: 600px;
149
+ }
150
+ .excluded_tabs .tab-wrapper .tab-container {
151
+ overflow: scroll;
152
+ }
153
  """
154
 
155
 
 
162
  def non_empty_list_or_none(input_list: list[str]) -> Optional[list[str]]:
163
  return input_list or None
164
 
 
 
 
 
 
 
 
 
165
 
166
  with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
167
  state = gr.State({"selected_block": 0})
168
  gr.Markdown("# Common Crawl Pipeline Creator")
169
+ with gr.Row():
170
+ with gr.Column():
171
+ gallery = gr.Gallery(
172
+ blocks,
173
+ columns=4,
174
+ rows=2,
175
+ label="Select step to edit",
176
+ object_fit="scale-down",
177
+ show_share_button=False,
178
+ show_download_button=False,
179
+ show_fullscreen_button=False,
180
+ elem_id="pipeline-gallery",
181
+ allow_preview=False,
182
+ )
183
+ gallery_image_buttons = [gr.Button(visible=False, elem_classes="block-button") for _ in blocks] # hack to simulate each image gallery as a button, see `make_gallery_image_buttons_js`
184
+ view_pipeline_results_button = gr.Button("Run Pipeline & Stream Results", variant="primary", scale=4)
185
+ blocks_uis = []
186
+ with gr.Column(visible=False) as col:
187
+ blocks_uis.append(col)
188
+ gr.Markdown("## 1. URL Filtering \n\nPerforms filtering based on samples urls.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  with gr.Group():
190
+ url_filtering_checkbox = gr.Checkbox(True, label="Enable")
191
+ with gr.Accordion("Parameters", open=True) as acc:
192
+ use_integrated_lists_checkbox = gr.Checkbox(True, label="use_integrated_lists", info="use the datatrove integrated lists of banned urls and words")
193
+ with gr.Row():
194
+ with gr.Column():
195
+ extra_domain_textbox = gr.Textbox("", label="extra_domains", info="remove if the domain is present in `extra_domains`")
196
+ extra_domain_textbox.prepare_parameter = prepare_as_list_or_none
197
+ extra_urls_textbox = gr.Textbox("", label="extra_urls", info="remove if the full url is present on `extra_urls`")
198
+ extra_urls_textbox.prepare_parameter = prepare_as_list_or_none
199
+ with gr.Column():
200
+ banned_words_textbox = gr.Textbox("", label="banned_words", info="remove if any word from `banned_words` is in the url")
201
+ banned_words_textbox.prepare_parameter = prepare_as_list_or_none
202
+ banned_subwords_textbox = gr.Textbox("", label="banned_subwords", info="remove if any word from `banned_subwords` is a substring of the url")
203
+ banned_subwords_textbox.prepare_parameter = prepare_as_list_or_none
204
+ with gr.Column():
205
+ soft_banned_words_textbox = gr.Textbox("", label="soft_banned_words", info="remove if there are at least `soft_word_threshold` words from `soft_banned_words` in the url")
206
+ soft_banned_words_textbox.prepare_parameter = prepare_as_list_or_none
207
+ soft_word_threshold_slider = gr.Slider(0, 5, value=2, step=1, label="soft_word_threshold", info="remove if there are at least `soft_word_threshold` words from `soft_banned_words` in the url")
208
+ url_filtering_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=url_filtering_checkbox, outputs=acc)
209
+ url_filtering_parameters_components = [use_integrated_lists_checkbox, extra_domain_textbox, extra_urls_textbox, banned_words_textbox, banned_subwords_textbox, soft_banned_words_textbox, soft_word_threshold_slider]
210
+ with gr.Column(visible=False) as col:
211
+ blocks_uis.append(col)
212
+ gr.Markdown("## 2. Text Extraction \n\nUses the [Trafilatura](https://trafilatura.readthedocs.io) extractor.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  with gr.Group():
214
+ text_extraction_checkbox = gr.Checkbox(True, label="Enable")
215
+ with gr.Accordion("Parameters", open=True) as acc:
216
+ with gr.Row():
217
+ favour_precision_checkbox = gr.Checkbox(True, label="favour_precision", info="prefer less text but correct extraction")
218
+ timeout_slider = gr.Slider(0.05, 0.5, value=0.1, step=0.05, label="timeout", info="the timeout for extraction, per document, in seconds")
219
+ deduplicate_checkbox = gr.Checkbox(True, label="deduplicate", info="trafilatura's deduplicate option")
220
+ text_extraction_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=text_extraction_checkbox, outputs=acc)
221
+ text_extraction_parameters_components = [favour_precision_checkbox, timeout_slider, deduplicate_checkbox]
222
+ with gr.Column(visible=False) as col:
223
+ blocks_uis.append(col)
224
+ gr.Markdown("## 3. Language Filtering \n\nUses the [fastext](https://fasttext.cc/docs/en/language-identification.html) language identification models.")
 
 
 
 
 
 
 
 
 
 
 
225
  with gr.Group():
226
+ language_filtering_checkbox = gr.Checkbox(True, label="Enable")
227
+ with gr.Accordion("Parameters", open=True) as acc:
228
+ with gr.Row():
229
+ languages_textbox = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), multiselect=True, label="languages", info="list of languages to keep. empty for all")
230
+ languages_textbox.prepare_parameter = non_empty_list_or_none
231
+ language_threshold_slider = gr.Slider(0, 1, value=0.65, step=0.05, label="language_threshold", info="minimum score to accept a document")
232
+ language_filtering_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=language_filtering_checkbox, outputs=acc)
233
+ language_filtering_parameters_components = [languages_textbox, language_threshold_slider]
234
+ with gr.Column(visible=False) as col:
235
+ blocks_uis.append(col)
236
+ gr.Markdown("## 4. Gopher Filtering (repetitions) \n\nUses the [Gopher](https://huggingface.co/papers/2112.11446) text repetition filters.")
237
+ with gr.Group():
238
+ gopher_filtering_repetitions_checkbox = gr.Checkbox(True, label="Enable")
239
+ with gr.Accordion("Parameters", open=True) as acc:
240
+ with gr.Group():
241
+ with gr.Row():
242
+ language_dropdown1 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
243
+ top_n_grams_textbox = gr.Textbox("(2, 0.2), (3, 0.18), (4, 0.16)", label="top_n_grams")
244
+ top_n_grams_textbox.prepare_parameter = ast.literal_eval
245
+ dup_n_grams_textbox = gr.Textbox("(5, 0.15), (6, 0.14), (7, 0.13), (8, 0.12), (9, 0.11), (10, 0.10)", label="dup_n_grams")
246
+ dup_n_grams_textbox.prepare_parameter = ast.literal_eval
247
+ with gr.Row():
248
+ dup_line_frac_slider = gr.Slider(0, 1, value=0.3, step=0.05, label="dup_line_frac")
249
+ dup_para_frac_slider = gr.Slider(0, 1, value=0.3, step=0.05, label="dup_para_frac")
250
+ dup_line_char_frac_slider = gr.Slider(0, 1, value=0.2, step=0.05, label="dup_line_char_frac")
251
+ dup_para_char_frac_slider = gr.Slider(0, 1, value=0.2, step=0.05, label="dup_para_char_frac")
252
+ gopher_filtering_repetitions_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_repetitions_checkbox, outputs=acc)
253
+ gopher_filtering_repetitions_parameters_components = [language_dropdown1, top_n_grams_textbox, dup_n_grams_textbox, dup_line_frac_slider, dup_para_frac_slider, dup_line_char_frac_slider, dup_para_char_frac_slider]
254
+ with gr.Column(visible=False) as col:
255
+ blocks_uis.append(col)
256
+ gr.Markdown("## 8. PII Removal \n\nReplaces email addresses and ip addresses in the document text.")
257
+ with gr.Group():
258
+ pii_removal_checkbox = gr.Checkbox(True, label="Enable")
259
+ with gr.Accordion("Parameters", open=True) as acc:
260
+ with gr.Row():
261
+ remove_emails_checkbox = gr.Checkbox(True, label="remove_emails", info="Replace email addresses")
262
+ remove_ips_checkbox = gr.Checkbox(True, label="remove_ips", info="Replace IP addresses")
263
+ only_remove_public_ips_checkbox = gr.Checkbox(True, label="only_remove_public_ips", info="by default we only replace public (and thus PII) IPs")
264
+ with gr.Row():
265
+ email_replacement_textbox = gr.Textbox("email@example.com, firstname.lastname@example.org", label="email_replacement", info="strings to use as replacement. They will be used in a circular way")
266
+ email_replacement_textbox.prepare_parameter = prepare_as_list_or_none
267
+ ip_replacement_textbox = gr.Textbox("22.214.171.124, 126.96.36.199, 188.8.131.52, 184.108.40.206, 220.127.116.11, 18.104.22.168", label="ip_replacement", info="same as email_replacement but for IP addresses")
268
+ ip_replacement_textbox.prepare_parameter = prepare_as_list_or_none
269
+ pii_removal_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=pii_removal_checkbox, outputs=acc)
270
+ pii_removal_parameters_components = [remove_emails_checkbox, remove_ips_checkbox, only_remove_public_ips_checkbox, email_replacement_textbox, ip_replacement_textbox]
271
+ with gr.Column(visible=False) as col:
272
+ blocks_uis.append(col)
273
+ gr.Markdown("## 7. Custom Filters \n\nUses the [FineWeb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) custom text filters.")
274
+ with gr.Group():
275
+ custom_filters_checkbox = gr.Checkbox(True, label="Enable")
276
+ with gr.Accordion("Parameters", open=True) as acc:
277
+ with gr.Row():
278
+ line_punct_thr_slider = gr.Slider(0, 1, value=0.12, step=0.01, label="line_punct_thr")
279
+ line_punct_exclude_zero = gr.Checkbox(False, label="line_punct_exclude_zero")
280
+ short_line_thr_slider = gr.Slider(0, 1, value=0.67, step=0.01, label="short_line_thr")
281
+ short_line_length_slider = gr.Slider(0, 100, value=30, step=1, label="short_line_length")
282
+ char_duplicates_ratio_slider = gr.Slider(0, 1, value=0.01, step=0.01, label="char_duplicates_ratio")
283
+ new_line_ratio_slider = gr.Slider(0, 1, value=0.3, step=0.01, label="new_line_ratio")
284
+ custom_filters_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=custom_filters_checkbox, outputs=acc)
285
+ custom_filters_parameters_components = [line_punct_thr_slider, line_punct_exclude_zero, short_line_thr_slider, short_line_length_slider, char_duplicates_ratio_slider, new_line_ratio_slider]
286
+ with gr.Column(visible=False) as col:
287
+ blocks_uis.append(col)
288
+ gr.Markdown("## 6. C4 Filters\n\nUses the [C4](https://huggingface.co/datasets/allenai/c4) text size and content filters.")
289
+ with gr.Group():
290
+ c4_filters_checkbox = gr.Checkbox(True, label="Enable")
291
+ with gr.Accordion(" Parameters", open=True) as acc:
292
+ with gr.Group():
293
+ with gr.Row():
294
+ split_paragraph_checkbox = gr.Checkbox(True, label="split_paragraph", info="disable to apply the filters to each sentence instead of to each line")
295
+ with gr.Row():
296
+ language_dropdown2 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
297
+ min_num_sentences_slider = gr.Slider(0, 10, value=5, step=1, label="min_num_sentences", info="remove documents that do not have at least this number of sentences (after line filtering)")
298
+ min_words_per_line_slider = gr.Slider(0, 10, value=3, step=1, label="min_words_per_line", info="drop lines without this min number of words")
299
+ max_word_length_slider = gr.Slider(0, 2000, value=1000, step=10, label="max_word_length", info=" drop lines where at least one word has more than this number of characters")
300
+ with gr.Row():
301
+ remove_citations_checkbox = gr.Checkbox(True, label="remove_citations", info="remove wikipedia style citations from the text")
302
+ filter_no_terminal_punct_checkbox = gr.Checkbox(True, label="filter_no_terminal_punct", info="remove lines without terminal punctuation marks")
303
+ filter_lorem_ipsum_checkbox = gr.Checkbox(True, label="filter_lorem_ipsum", info="drop documents that contain 'lorem ipsum'")
304
+ filter_javascript_checkbox = gr.Checkbox(True, label="filter_javascript", info="drop lines mentioning 'javascript'")
305
+ filter_curly_bracket = gr.Checkbox(True, label="filter_curly_bracket", info="drop documents containing {")
306
+ filter_policy = gr.Checkbox(True, label="filter_policy", info="drop lines containing any of the policy phrases (e.g. 'terms of use', 'use cookies')")
307
+ c4_filters_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=c4_filters_checkbox, outputs=acc)
308
+ c4_filters_parameters_components = [split_paragraph_checkbox, language_dropdown2, min_num_sentences_slider, min_words_per_line_slider, max_word_length_slider, remove_citations_checkbox, filter_no_terminal_punct_checkbox, filter_lorem_ipsum_checkbox, filter_javascript_checkbox, filter_curly_bracket, filter_policy]
309
+ with gr.Column(visible=False) as col:
310
+ blocks_uis.append(col)
311
+ gr.Markdown("## 5. Gopher Filtering (quality) \n\nUses the [Gopher](https://huggingface.co/papers/2112.11446) text quality filters.")
312
+ with gr.Group():
313
+ gopher_filtering_quality_checkbox = gr.Checkbox(True, label="Enable")
314
+ with gr.Accordion("Parameters", open=True) as acc:
315
+ with gr.Group():
316
+ with gr.Row():
317
+ language_dropdown2 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
318
+ min_doc_words_slider = gr.Slider(0, 1000, value=50, step=10, label="min_doc_words")
319
+ max_doc_words_slider = gr.Slider(0, 200_000, value=100_000, step=10_000, label="max_doc_words")
320
+ with gr.Row():
321
+ min_avg_word_length_slider = gr.Slider(0, 20, value=3, step=1, label="min_avg_word_length")
322
+ max_avg_word_length_slider = gr.Slider(0, 20, value=10, step=1, label="max_avg_word_length")
323
+ with gr.Row():
324
+ max_symbol_word_ratio_slider = gr.Slider(0, 1, value=0.1, step=0.05, label="max_symbol_word_ratio")
325
+ max_bullet_lines_ratio_slider = gr.Slider(0, 1, value=0.9, step=0.05, label="max_bullet_lines_ratio")
326
+ max_ellipsis_lines_ratio_slider = gr.Slider(0, 1, value=0.3, step=0.05, label="max_ellipsis_lines_ratio")
327
+ max_non_alpha_words_ratio_slider = gr.Slider(0, 1, value=0.8, step=0.05, label="max_non_alpha_words_ratio")
328
+ with gr.Row():
329
+ min_stop_words_slider = gr.Slider(0, 10, value=2, step=1, label="min_stop_words")
330
+ stop_words_textbox = gr.Textbox("the, be, to, of, and, that, have, with", label="stop_words")
331
+ stop_words_textbox.prepare_parameter = prepare_as_list_or_none
332
+ gopher_filtering_quality_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_quality_checkbox, outputs=acc)
333
+ gopher_filtering_quality_parameters_components = [language_dropdown2, min_doc_words_slider, max_doc_words_slider, min_avg_word_length_slider, max_avg_word_length_slider, max_symbol_word_ratio_slider, max_bullet_lines_ratio_slider, max_ellipsis_lines_ratio_slider, max_non_alpha_words_ratio_slider, min_stop_words_slider, stop_words_textbox]
334
+
335
+ steps_parameters_components = [
336
+ url_filtering_parameters_components,
337
+ text_extraction_parameters_components,
338
+ language_filtering_parameters_components,
339
+ gopher_filtering_repetitions_parameters_components,
340
+ gopher_filtering_quality_parameters_components,
341
+ c4_filters_parameters_components,
342
+ custom_filters_parameters_components,
343
+ pii_removal_parameters_components
344
+ ]
345
+
346
+ with gr.Column():
347
+ with gr.Tab("Output") as output_tab:
348
+ output_dataframe = gr.DataFrame(datatype="markdown")
349
+ with gr.Tab("Excluded") as excluded_tab:
350
+ with gr.Tabs(elem_classes="excluded_tabs"):
351
+ excluded_dataframes: dict[Type, gr.DataFrame] = {}
352
+ excluded_tabs: dict[Type, gr.Tab] = {}
353
+ for step in steps:
354
+ if issubclass(step, BaseFilter) and step is not URLFilter:
355
+ with gr.Tab(step.__name__) as t:
356
+ excluded_dataframes[step] = gr.DataFrame(datatype="markdown")
357
+ excluded_tabs[step] = t
358
+ with gr.Tab("Python code") as code_tab:
359
+ python_code_markdown = gr.Markdown(DEFAULT_CODE)
360
 
361
 
362
  gr.Markdown("_powered by [datatrove](https://github.com/huggingface/datatrove)_")