omkarenator committed
Commit 61deef0 · 1 Parent(s): 370837e

all data views in

Files changed (1):
  1. web.py +39 -51
web.py CHANGED
@@ -9,7 +9,7 @@ from data.url_blocklist import urls_high_matches, urls_false_positives
  from data.non_web_urls import non_web_urls


- def view_data_static(
+ def DVS(
  left,
  header,
  ):
@@ -28,7 +28,7 @@ def view_data_static(
  return Div(H3(header), data_display, style="margin-top: 10px;")


- def view_data(
+ def DV(
  left_file,
  doc_id,
  header,
@@ -79,7 +79,7 @@ def view_data(
  return Div(form, data_display, style="margin-top: 10px;", id=target)


- def view_data_2col(
+ def DV2(
  left_file,
  right_file,
  doc_id,
@@ -149,7 +149,7 @@ def update(target: str, request):
  right_file = params.get("right_file")
  if left_file and right_file:
  return (
- view_data_2col(
+ DV2(
  left_file,
  right_file,
  doc_id,
@@ -157,7 +157,7 @@ def update(target: str, request):
  ),
  )
  else:
- return view_data(
+ return DV(
  left_file,
  doc_id,
  params.get("header"),
@@ -206,18 +206,18 @@ def web_data():
  we found WET files to include boilerplate content like navigation menus, ads, and other irrelevant texts.
  Accordingly, our pipeline starts from raw WARC files, reads with the warcio library, and extracts texts using trafilatura.
  """),
- view_data_2col("data/sample_wet.json", "data/sample_warc.json", 3),
+ DV2("data/sample_wet.json", "data/sample_warc.json", 3),
  H4("1.2 Language Identification"),
  P("""
  After text extraction, the non-English texts are then filtered out by fastText language identifier with a threshold of 0.65.
  This step removes over 60% of the whole data.
  """),
- view_data(
+ DV(
  "data/sample_non_en.json",
  3,
  "Sample documents that are classified as non-English",
  ),
- view_data(
+ DV(
  "data/sample_en_low.json",
  3,
  "Sample documents that are classified as English but with score less than 0.65",
@@ -233,14 +233,12 @@ def web_data():
  articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
  4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
  """),
- view_data_static(urls_high_matches, "24 URL domains with more than 4k matches"),
+ DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
  P("""
  We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
  """),
- view_data_static(
- urls_false_positives, "6 url domains that are removed from the blocklist"
- ),
- view_data(
+ DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
+ DV(
  "data/bad_url_doc.jsonl",
  3,
  "Sample documents whose urls are blocked by the refined url blocklist",
@@ -249,11 +247,11 @@ def web_data():
  P("""
  To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
  """),
- view_data_static(
+ DVS(
  non_web_urls,
  "curated url domains that are excluded from our dataset",
  ),
- view_data(
+ DV(
  "data/sample_url_exclusion.json",
  0,
  "Sample documents whose urls are in our curated url domain list",
@@ -272,7 +270,7 @@ def web_data():
  of 56,292 additional lines, resulting in the complete exclusion of 2,203 documents from a total of 13,560
  documents (16.25%). Accordingly, we choose to not use terminal punctuation as a signal to remove lines.
  """),
- view_data(
+ DV(
  "data/sample_terminal_punc.json",
  0,
  "Sample documents with lines that are removed by the rule of terminal punctuation",
@@ -285,7 +283,7 @@ def web_data():
  propose to refine the strategy by adding one more keyword to the word "javascript" to avoid false positives.
  The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
  """),
- view_data(
+ DV(
  "data/sample_java.jsonl",
  0,
  "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
@@ -298,7 +296,7 @@ def web_data():
  - The line matches the pattern “r'^\\d+\\s+likes$'”,
  - The line contains only one word.
  """),
- view_data(
+ DV(
  "data/sample_refinedweb_line.json",
  0,
  "Sample documents with lines that are removed by the RefinedWeb rules",
@@ -311,7 +309,7 @@ def web_data():
  line is in the first 3 lines or in the last 3 lines) to remove toxic lines. Specifically, we do not only consider
  the bad words from English but also consider the bad words from other languages.
  """),
- view_data_static(
+ DVS(
  json.load(open("data/toxic_lines.json")),
  "Sample documents with toxic lines",
  ),
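
A sketch of the positional toxic-line rule described above, which checks only the first and last three lines; the word list is a placeholder, not the multilingual lists actually used.

```python
# Placeholder word list; the pipeline uses multilingual bad-word lists.
BAD_WORDS = {"badword1", "badword2"}

def remove_toxic_edge_lines(text: str) -> str:
    """Drop a line only when it contains a bad word AND sits in the first or last 3 lines."""
    lines = text.split("\n")
    kept = []
    for i, line in enumerate(lines):
        at_edge = i < 3 or i >= len(lines) - 3
        toxic = any(word in line.lower() for word in BAD_WORDS)
        if not (at_edge and toxic):
            kept.append(line)
    return "\n".join(kept)
```
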
@@ -319,7 +317,7 @@ def web_data():
  P("""
  In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
  Overview of all the quality signals that are used for filtering."""),
- view_data_static(
+ DVS(
  json.load(open("data/all_signals.json")),
  "Overview of all the quality signals that are used for filtering",
  ),
@@ -368,9 +366,10 @@ def web_data():
  ensures consistency with the overall document character count calculation.
  """),
  H5("Our Implementation"),
- Img(
- src="path/to/sample_filtered_lines.png",
- alt="Sample documents filtered by excessive line repetitions / characters in repeated lines",
+ DV(
+ "data/repeat_line_frac.jsonl",
+ 0,
+ "Sample documents filtered by excessive line repetitions / characters in repeated lines",
  ),
  H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
  P("""
@@ -394,9 +393,10 @@ def web_data():
  only once — tend to be short.
  """),
  H5("Our Implementations"),
- Img(
- src="path/to/sample_common_ngrams.png",
- alt="Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
+ DV(
+ "data/sample_top_ngram.json",
+ 0,
+ "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
  ),
  H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
  P("""
@@ -423,18 +423,15 @@ def web_data():
  We decided to use the RedPajama V2 implementation but skip the 1st occurrence of the duplicate n-gram.
  """),
  H5("Our Implementations"),
- Img(
- src="path/to/sample_dup_ngrams.png",
- alt="Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
- ),
  H5("An Example to Show the Difference Between Above Implementations"),
  P("..."), # Add specific examples if available
  H5(
  "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
  ),
- Img(
- src="path/to/sample_dup_ngrams_filtered.png",
- alt="Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
+ DV(
+ "data/sample_dup_ngram.json",
+ 0,
+ "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
  ),
  H4("3.2 Line-wise Heuristics"),
  P("""
@@ -443,9 +440,10 @@ def web_data():
  works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
  90% of lines start with a bullet point.
  """),
- Img(
- src="path/to/sample_line_weirdness_removed.png",
- alt="Sample documents that are filtered out by line-wise heuristics",
+ DV(
+ "data/line_info.json",
+ 0,
+ "Sample documents that are filtered out by line-wise heuristics",
  ),
  H4("3.3 Statistics-based Heuristics"),
  P("""
@@ -505,10 +503,6 @@ median_word_length = median(len(word) for word in words)
  The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
  to split text into sentences.
  """),
- Img(
- src="path/to/sample_sentences_split.png",
- alt="Sample documents split into sentences",
- ),
  P("""
  However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
  we opted to use `nltk.tokenize.sent_tokenize` for more accurate sentence splitting.
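
A small sketch contrasting regex-based splitting with nltk.tokenize.sent_tokenize on a URL-bearing sentence; the regex here is a generic stand-in, not the RedPajama V2 one.

```python
import re
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)  # Punkt sentence model (one-time download)

text = "See https://example.com/page.html for details. It loads fine."

# A naive split on every period also breaks inside the URL.
naive = [s for s in re.split(r"[.!?]\s*", text) if s]

# sent_tokenize keeps the URL intact and returns the two real sentences.
print(len(naive), sent_tokenize(text))
```
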
@@ -522,10 +516,6 @@ median_word_length = median(len(word) for word in words)
  Following RedPajama-V2 and DataTrove, we use the symbols of ("#", "...", "…").
  We calculate the ratio as the number of symbols divided by the total number of words.
  """),
- Img(
- src="path/to/sample_symbol_word_ratio.png",
- alt="Sample documents filtered by symbol-to-word ratio",
- ),
  H5("Fraction of Alphabetic Words"),
  P("""
  Implementations from Dolma
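
A sketch of the symbol-to-word ratio using the symbols listed above; counting raw occurrences (rather than symbol-bearing words) is an assumption.

```python
SYMBOLS = ("#", "...", "…")

def symbol_to_word_ratio(text: str) -> float:
    """Count of the listed symbols divided by the total number of words."""
    words = text.split()
    if not words:
        return 0.0
    return sum(text.count(sym) for sym in SYMBOLS) / len(words)
```
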
@@ -549,19 +539,17 @@ median_word_length = median(len(word) for word in words)
  alt="Sample documents filtered by number of stop words",
  ),
  H5("Our Implementations"),
- Img(
- src="path/to/sample_statistics_based_filters.png",
- alt="Sample documents that are filtered out by statistics-based heuristics",
+ DV(
+ "data/sample_doc_stat.json",
+ 0,
+ "Sample documents that are filtered out by statistics-based heuristics",
  ),
  H4("3.4 Others"),
  P("""
  Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
  text.
  """),
- Img(
- src="path/to/sample_lorem_ipsum.png",
- alt="Sample documents containing 'lorem ipsum'",
- ),
+ DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum"),
  H3("4. Deduplication"),
  P("..."), # Add detailed content and images as needed
  H3("5. PII Removal"),
 