Add citations and other fixes

#4
Files changed (3) hide show
  1. common.py +4 -3
  2. curated.py +5 -5
  3. main.py +10 -4
common.py CHANGED
@@ -308,7 +308,7 @@ global_div = Div(
308
  "Deduplication is beneficial for LM pretraining in several ways, with the most important being controllable upsampling. With unique data, teams gain fine-grained control over the training data. Other benefits of deduplication include avoiding train-test overlap which prevents evaluation contamination."
309
  ),
310
  P(
311
- "Duplicate data can lead to a strong double descent phenomenon, where repeated data causes test loss to increase midway through training [2]. Additionally, it reduces the risk of memorization [1]. By implementing deduplication and selective upsampling, we gain control over the pretraining data distribution, rather than relying on the inherent distribution of the source."
312
  ),
313
  P(
314
  "To illustrate the need for deduplication, below is the distribution of near-duplicate clusters, organized into buckets of 100. The first bucket contains clusters with sizes ranging from 2 to 100, as found in the Common Crawl dataset. Some clusters even reach up to a million documents."
@@ -406,6 +406,7 @@ global_div = Div(
406
  "In sizable clusters comprising 1000 or more documents, we observe a trend towards templatization. This involves the recurrent use of standardized language to convey general topics such as terms and conditions, warnings, and disclaimers. Such language is prevalent on commercial websites, offering a consistent and efficient way to communicate commonly encountered information."
407
  ),
408
  Img(src="images/image9.png", style="max-width: 100%;"),
 
409
  ),
410
  Section(
411
  H2("Personally Identifiable Information Removal"),
@@ -435,7 +436,7 @@ global_div = Div(
435
  style="list-style-type: none",
436
  ),
437
  ),
438
- id="section47",
439
  ),
440
  Section(
441
  H2("Normalization Form C"),
@@ -455,7 +456,7 @@ global_div = Div(
455
  style="list-style-type: none",
456
  )
457
  ), # "background-color= gray" "color= blue" maybe add this later
458
- id="section48",
459
  ),
460
  Section(
461
  H3("NFC Examples"),
 
308
  "Deduplication is beneficial for LM pretraining in several ways, with the most important being controllable upsampling. With unique data, teams gain fine-grained control over the training data. Other benefits of deduplication include avoiding train-test overlap which prevents evaluation contamination."
309
  ),
310
  P(
311
+ "Duplicate data can lead to a strong double descent phenomenon, where repeated data causes test loss to increase midway through training", D_cite(bibtex_key="hernandez2022scaling"), ". Additionally, deduplication reduces the risk of memorization", D_cite(bibtex_key="lee2022deduplicating"), ". By implementing deduplication and selective upsampling, we gain control over the pretraining data distribution, rather than relying on the inherent distribution of the source."
312
  ),
313
  P(
314
  "To illustrate the need for deduplication, below is the distribution of near-duplicate clusters, organized into buckets of 100. The first bucket contains clusters with sizes ranging from 2 to 100, as found in the Common Crawl dataset. Some clusters even reach up to a million documents."
 
406
  "In sizable clusters comprising 1000 or more documents, we observe a trend towards templatization. This involves the recurrent use of standardized language to convey general topics such as terms and conditions, warnings, and disclaimers. Such language is prevalent on commercial websites, offering a consistent and efficient way to communicate commonly encountered information."
407
  ),
408
  Img(src="images/image9.png", style="max-width: 100%;"),
409
+ id="section47",
410
  ),
411
  Section(
412
  H2("Personally Identifiable Information Removal"),
 
436
  style="list-style-type: none",
437
  ),
438
  ),
439
+ id="section48",
440
  ),
441
  Section(
442
  H2("Normalization Form C"),
 
456
  style="list-style-type: none",
457
  )
458
  ), # "background-color= gray" "color= blue" maybe add this later
459
+ id="section49",
460
  ),
461
  Section(
462
  H3("NFC Examples"),
curated.py CHANGED
@@ -296,7 +296,7 @@ table_div_hn = Div(NotStr(table_html_hn))
296
  uirc_filter = pd.DataFrame(
297
  {
298
  "Dataset": [
299
- "Ubunutu IRC",
300
  ],
301
  "Lines Downloaded": [
302
  "37966",
@@ -854,7 +854,7 @@ filtering_process = Div(
854
  style="margin-bottom: -3px",
855
  ),
856
  Li(
857
- "Paragraph Count Filter: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log world probability",
858
  style="margin-bottom: -3px",
859
  ),
860
  Li(
@@ -1140,7 +1140,7 @@ filtering_process = Div(
1140
  Raw single line in data: <P> Hi I am speaker
1141
  After tag removal: P Hi I am speaker
1142
  We remove everything that starts with ["P", "BRK", "CHAPTER", "/P"]
1143
- and only keep tagnae == SPEAKER
1144
  because line starting with <SPEAKER> TEXT TEXT ....... has the relevant text
1145
  """,
1146
  style="block",
@@ -1217,7 +1217,7 @@ filtering_process = Div(
1217
  style="margin-bottom: -3px",
1218
  ),
1219
  Li(
1220
- "As discussed above, the comment heirarchies required a thoughful approach to extracting meaningful data. ",
1221
  style="margin-bottom: -3px",
1222
  ),
1223
  Li(
@@ -1374,7 +1374,7 @@ filtering_process = Div(
1374
  P(B("Unique Data Preparation Challenges: ")),
1375
  Ul(
1376
  Li(
1377
- "Handling code block was a required finding the specific blocks and exacting the details in one snippet.",
1378
  style="margin-bottom: -3px",
1379
  ),
1380
  Li(
 
296
  uirc_filter = pd.DataFrame(
297
  {
298
  "Dataset": [
299
+ "Ubuntu IRC",
300
  ],
301
  "Lines Downloaded": [
302
  "37966",
 
854
  style="margin-bottom: -3px",
855
  ),
856
  Li(
857
+ "Paragraph Count Filter: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log word probability",
858
  style="margin-bottom: -3px",
859
  ),
860
  Li(
 
1140
  Raw single line in data: <P> Hi I am speaker
1141
  After tag removal: P Hi I am speaker
1142
  We remove everything that starts with ["P", "BRK", "CHAPTER", "/P"]
1143
+ and only keep tagname == SPEAKER
1144
  because line starting with <SPEAKER> TEXT TEXT ....... has the relevant text
1145
  """,
1146
  style="block",
 
1217
  style="margin-bottom: -3px",
1218
  ),
1219
  Li(
1220
+ "As discussed above, the comment hierarchies required a thoughtful approach to extracting meaningful data. ",
1221
  style="margin-bottom: -3px",
1222
  ),
1223
  Li(
 
1374
  P(B("Unique Data Preparation Challenges: ")),
1375
  Ul(
1376
  Li(
1377
+ "Handling code blocks required finding the specific blocks and extracting the details in one snippet.",
1378
  style="margin-bottom: -3px",
1379
  ),
1380
  Li(
main.py CHANGED
@@ -328,16 +328,22 @@ def main():
328
  ),
329
  Li(
330
  A(
331
- "Personally Identifiable Information Removal",
332
  href="#section47",
333
  )
334
  ),
335
  Li(
336
  A(
337
- "Normalization Form C",
338
  href="#section48",
339
  )
340
  ),
 
 
 
 
 
 
341
  ),
342
  ),
343
  Div(
@@ -872,7 +878,7 @@ def intro():
872
  D_cite(bibtex_key="dclm"),
873
  "and RedPajama V2,",
874
  D_cite(bibtex_key="redpajama-v2"),
875
- "we also hope to provide a dataset at this scale that is ready to go, without requiring futher filtering."
876
  ),
877
  P(
878
  B("How to Read this Blog Post?"),
@@ -884,7 +890,7 @@ def intro():
884
  Section(
885
  H2("Why TxT360"),
886
  P(
887
- "In this year we have seen excellent datasets released by the community. Among those, most datasets focus on one source (e.g., crawled websites, code bases, papers). However, it is not trivial to combine these sources together due to the potential duplicaiton across them. TxT360 is the first dataset to combine most of sources commonly used in pretraining."
888
  ),
889
  new_table_div_1,
890
  # table_div_1,
 
328
  ),
329
  Li(
330
  A(
331
+ "Analysis of Near-Duplicate Clusters",
332
  href="#section47",
333
  )
334
  ),
335
  Li(
336
  A(
337
+ "Personally Identifiable Information Removal",
338
  href="#section48",
339
  )
340
  ),
341
+ Li(
342
+ A(
343
+ "Normalization Form C",
344
+ href="#section49",
345
+ )
346
+ ),
347
  ),
348
  ),
349
  Div(
 
878
  D_cite(bibtex_key="dclm"),
879
  "and RedPajama V2,",
880
  D_cite(bibtex_key="redpajama-v2"),
881
+ "we also hope to provide a dataset at this scale that is ready to go, without requiring further filtering."
882
  ),
883
  P(
884
  B("How to Read this Blog Post?"),
 
890
  Section(
891
  H2("Why TxT360"),
892
  P(
893
+ "This year we have seen excellent datasets released by the community. Among those, most datasets focus on one source (e.g., crawled websites, code bases, papers). However, it is not trivial to combine these sources together due to the potential duplication across them. TxT360 is the first dataset to combine most of the sources commonly used in pretraining."
894
  ),
895
  new_table_div_1,
896
  # table_div_1,