all data views in
omkarenator committed · Commit 61deef0 · 1 Parent(s): 370837e

web.py CHANGED
@@ -9,7 +9,7 @@ from data.url_blocklist import urls_high_matches, urls_false_positives
 from data.non_web_urls import non_web_urls
 
 
-def view_data_static(
+def DVS(
     left,
     header,
 ):
@@ -28,7 +28,7 @@ def view_data_static(
     return Div(H3(header), data_display, style="margin-top: 10px;")
 
 
-def view_data(
+def DV(
     left_file,
     doc_id,
     header,
@@ -79,7 +79,7 @@ def view_data(
     return Div(form, data_display, style="margin-top: 10px;", id=target)
 
 
-def
+def DV2(
     left_file,
     right_file,
     doc_id,
@@ -149,7 +149,7 @@ def update(target: str, request):
     right_file = params.get("right_file")
     if left_file and right_file:
         return (
-
+            DV2(
                 left_file,
                 right_file,
                 doc_id,
@@ -157,7 +157,7 @@ def update(target: str, request):
             ),
         )
     else:
-        return view_data(
+        return DV(
             left_file,
             doc_id,
             params.get("header"),
@@ -206,18 +206,18 @@ def web_data():
         we found WET files to include boilerplate content like navigation menus, ads, and other irrelevant texts.
         Accordingly, our pipeline starts from raw WARC files, reads with the warcio library, and extracts texts using trafilatura.
         """),
-
+        DV2("data/sample_wet.json", "data/sample_warc.json", 3),
         H4("1.2 Language Identification"),
         P("""
         After text extraction, the non-English texts are then filtered out by fastText language identifier with a threshold of 0.65.
         This step removes over 60% of the whole data.
         """),
-
+        DV(
             "data/sample_non_en.json",
             3,
             "Sample documents that are classified as non-English",
         ),
-
+        DV(
             "data/sample_en_low.json",
             3,
             "Sample documents that are classified as English but with score less than 0.65",
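For readers of the hunk above, a minimal sketch of the WARC-to-text and language-identification stage it describes; the helper name, file handling, and model path are assumptions, not code from this Space.

import fasttext
import trafilatura
from warcio.archiveiterator import ArchiveIterator

# lid.176.bin is the standard fastText language-identification model (assumed local path).
lang_model = fasttext.load_model("lid.176.bin")

def iter_english_docs(warc_path, threshold=0.65):
    """Yield (url, text) pairs extracted by trafilatura and labelled English with score >= threshold."""
    with open(warc_path, "rb") as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != "response":
                continue
            html = record.content_stream().read().decode("utf-8", errors="ignore")
            text = trafilatura.extract(html)  # drops navigation menus, ads, and other boilerplate
            if not text:
                continue
            labels, scores = lang_model.predict(text.replace("\n", " "))
            if labels[0] == "__label__en" and scores[0] >= threshold:
                yield record.rec_headers.get_header("WARC-Target-URI"), text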
@@ -233,14 +233,12 @@ def web_data():
         articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
         4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
         """),
-
+        DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
         P("""
         We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
         """),
-
-
-        ),
-        view_data(
+        DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
+        DV(
             "data/bad_url_doc.jsonl",
             3,
             "Sample documents whose urls are blocked by the refined url blocklist",
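A rough sketch of the URL-domain matching against the UT1 blocklist described in the hunk above; the 4k review threshold comes from the text, while the function and variable names are assumptions.

from collections import Counter
from urllib.parse import urlparse

def blocklist_domain_hits(urls, blocked_domains):
    """Count sampled URLs whose domain appears in the blocklist."""
    blocked = set(blocked_domains)
    hits = Counter()
    for url in urls:
        domain = urlparse(url).netloc.lower().removeprefix("www.")
        if domain in blocked:
            hits[domain] += 1
    return hits

# Domains with more than 4k matches are surfaced for manual review, as in the text above:
# high_matches = {d: c for d, c in blocklist_domain_hits(sample_urls, ut1_domains).items() if c > 4000}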
@@ -249,11 +247,11 @@ def web_data():
         P("""
         To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
         """),
-
+        DVS(
             non_web_urls,
             "curated url domains that are excluded from our dataset",
         ),
-
+        DV(
             "data/sample_url_exclusion.json",
             0,
             "Sample documents whose urls are in our curated url domain list",
@@ -272,7 +270,7 @@ def web_data():
         of 56,292 additional lines, resulting in the complete exclusion of 2,203 documents from a total of 13,560
         documents (16.25%). Accordingly, we choose to not use terminal punctuation as a signal to remove lines.
         """),
-
+        DV(
             "data/sample_terminal_punc.json",
             0,
             "Sample documents with lines that are removed by the rule of terminal punctuation",
@@ -285,7 +283,7 @@ def web_data():
         propose to refine the strategy by adding one more keyword to the word "javascript" to avoid false positives.
         The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
         """),
-
+        DV(
             "data/sample_java.jsonl",
             0,
             "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
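A minimal sketch of the refined C4 "javascript" line rule described above; illustrative only, the filter in the actual pipeline may differ in detail.

JS_CONTEXT_WORDS = ("enable", "disable", "require", "activate", "browser")

def drop_javascript_line(line: str) -> bool:
    """Drop a line only when 'javascript' co-occurs with one of the extra keywords."""
    lower = line.lower()
    return "javascript" in lower and any(word in lower for word in JS_CONTEXT_WORDS)

# drop_javascript_line("Please enable JavaScript to view the comments.")  -> True
# drop_javascript_line("I wrote a small JavaScript game last weekend.")   -> False (kept after the refinement)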
@@ -298,7 +296,7 @@ def web_data():
         - The line matches the pattern “r'^\\d+\\s+likes$'”,
         - The line contains only one word.
         """),
-
+        DV(
             "data/sample_refinedweb_line.json",
             0,
             "Sample documents with lines that are removed by the RefinedWeb rules",
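A sketch of the two RefinedWeb-style line conditions quoted above; the full rule set has more conditions than shown here.

import re

LIKES_RE = re.compile(r"^\d+\s+likes$")

def drop_refinedweb_line(line: str) -> bool:
    stripped = line.strip().lower()
    if LIKES_RE.match(stripped):        # e.g. "42 likes"
        return True
    if len(stripped.split()) == 1:      # single-word lines such as "Share"
        return True
    return False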
@@ -311,7 +309,7 @@ def web_data():
         line is in the first 3 lines or in the last 3 lines) to remove toxic lines. Specifically, we do not only consider
         the bad words from English but also consider the bad words from other languages.
         """),
-
+        DVS(
             json.load(open("data/toxic_lines.json")),
             "Sample documents with toxic lines",
         ),
@@ -319,7 +317,7 @@ def web_data():
         P("""
         In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
         Overview of all the quality signals that are used for filtering."""),
-
+        DVS(
             json.load(open("data/all_signals.json")),
             "Overview of all the quality signals that are used for filtering",
         ),
@@ -368,9 +366,10 @@ def web_data():
         ensures consistency with the overall document character count calculation.
         """),
         H5("Our Implementation"),
-
-
-
+        DV(
+            "data/repeat_line_frac.jsonl",
+            0,
+            "Sample documents filtered by excessive line repetitions / characters in repeated lines",
         ),
         H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
         P("""
@@ -394,9 +393,10 @@ def web_data():
         only once — tend to be short.
         """),
         H5("Our Implementations"),
-
-
-
+        DV(
+            "data/sample_top_ngram.json",
+            0,
+            "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
         ),
         H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
         P("""
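A rough sketch of the "fraction of characters in the most common n-gram" signal (n = 2, 3, 4) referenced above; the exact tokenization and normalization in the pipeline may differ.

from collections import Counter

def top_ngram_char_fraction(text: str, n: int) -> float:
    """Characters covered by the single most frequent word n-gram, over all word characters."""
    words = text.split()
    total_chars = sum(len(w) for w in words)
    if len(words) < n or total_chars == 0:
        return 0.0
    counts = Counter(tuple(words[i:i + n]) for i in range(len(words) - n + 1))
    top_gram, freq = counts.most_common(1)[0]
    return freq * sum(len(w) for w in top_gram) / total_chars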
@@ -423,18 +423,15 @@ def web_data():
         We decided to use the RedPajama V2 implementation but skip the 1st occurrence of the duplicate n-gram.
         """),
         H5("Our Implementations"),
-        Img(
-            src="path/to/sample_dup_ngrams.png",
-            alt="Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
-        ),
         H5("An Example to Show the Difference Between Above Implementations"),
         P("..."), # Add specific examples if available
         H5(
             "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
         ),
-
-
-
+        DV(
+            "data/sample_dup_ngram.json",
+            0,
+            "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
         ),
         H4("3.2 Line-wise Heuristics"),
         P("""
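A hedged sketch of the duplicated n-gram signal with the "skip the 1st occurrence" choice mentioned above; overlap handling is simplified compared to the RedPajama V2 code.

def dup_ngram_char_fraction(text: str, n: int) -> float:
    """Fraction of word characters inside repeated n-grams, not counting each n-gram's first occurrence."""
    words = text.split()
    total_chars = sum(len(w) for w in words)
    if len(words) < n or total_chars == 0:
        return 0.0
    seen = set()
    dup_chars = 0
    for i in range(len(words) - n + 1):
        gram = tuple(words[i:i + n])
        if gram in seen:
            dup_chars += sum(len(w) for w in gram)   # later occurrences only
        else:
            seen.add(gram)
    return min(dup_chars / total_chars, 1.0)         # overlapping grams can be counted twice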
@@ -443,9 +440,10 @@ def web_data():
         works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
         90% of lines start with a bullet point.
         """),
-
-
-
+        DV(
+            "data/line_info.json",
+            0,
+            "Sample documents that are filtered out by line-wise heuristics",
         ),
         H4("3.3 Statistics-based Heuristics"),
         P("""
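A minimal sketch of the two document-level line heuristics above (30% ellipsis endings, 90% bullet starts); the bullet and ellipsis symbol sets are assumptions.

BULLETS = ("-", "*", "•", "‣", "▪")
ELLIPSES = ("...", "…")

def drop_by_line_heuristics(text: str) -> bool:
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if not lines:
        return True
    ellipsis_frac = sum(line.endswith(ELLIPSES) for line in lines) / len(lines)
    bullet_frac = sum(line.startswith(BULLETS) for line in lines) / len(lines)
    return ellipsis_frac > 0.30 or bullet_frac > 0.90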
@@ -505,10 +503,6 @@ median_word_length = median(len(word) for word in words)
         The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
         to split text into sentences.
         """),
-        Img(
-            src="path/to/sample_sentences_split.png",
-            alt="Sample documents split into sentences",
-        ),
         P("""
         However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
         we opted to use `nltk.tokenize.sent_tokenize` for more accurate sentence splitting.
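A small sketch of the sentence-count signal using nltk.tokenize.sent_tokenize, which the hunk above prefers over regex splitting because it does not treat periods inside URLs as sentence boundaries.

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)  # one-time tokenizer data download

def num_sentences(text: str) -> int:
    return len(sent_tokenize(text))

# num_sentences("See https://example.com/a.b.c for details. It works well.")  -> 2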
@@ -522,10 +516,6 @@ median_word_length = median(len(word) for word in words)
         Following RedPajama-V2 and DataTrove, we use the symbols of ("#", "...", "…").
         We calculate the ratio as the number of symbols divided by the total number of words.
         """),
-        Img(
-            src="path/to/sample_symbol_word_ratio.png",
-            alt="Sample documents filtered by symbol-to-word ratio",
-        ),
         H5("Fraction of Alphabetic Words"),
         P("""
         Implementations from Dolma
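A sketch of the symbol-to-word ratio with the symbols ("#", "...", "…") listed above; counting raw occurrences over whitespace-delimited words is an assumption about the exact bookkeeping.

SYMBOLS = ("#", "...", "…")

def symbol_to_word_ratio(text: str) -> float:
    words = text.split()
    if not words:
        return 0.0
    symbol_count = sum(text.count(symbol) for symbol in SYMBOLS)
    return symbol_count / len(words)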
@@ -549,19 +539,17 @@ median_word_length = median(len(word) for word in words)
             alt="Sample documents filtered by number of stop words",
         ),
         H5("Our Implementations"),
-
-
-
+        DV(
+            "data/sample_doc_stat.json",
+            0,
+            "Sample documents that are filtered out by statistics-based heuristics",
         ),
         H4("3.4 Others"),
         P("""
         Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
         text.
         """),
-        Img(
-            src="path/to/sample_lorem_ipsum.png",
-            alt="Sample documents containing 'lorem ipsum'",
-        ),
+        DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum"),
         H3("4. Deduplication"),
         P("..."), # Add detailed content and images as needed
         H3("5. PII Removal"),
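Finally, a trivial sketch of the C4-style "lorem ipsum" page filter referenced in the last hunk.

def drop_lorem_ipsum(text: str) -> bool:
    """Remove any page whose text contains the placeholder phrase."""
    return "lorem ipsum" in text.lower()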