victormiller committed
Commit
ed0e179
1 Parent(s): 6a7bb93

Update curated.py

Files changed (1)
  1. curated.py +36 -318
curated.py CHANGED
@@ -455,34 +455,6 @@ data_sources = [
     "Europarl",
 ]
 
-def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "Freelaw":
-        raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/freelaw_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Freelaw",
-        data_sources="Freelaw",
-        target=target,
-    )
-
-freelaw_examples = Div(
-    Div(
-        get_freelaw_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
 
 
 def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
@@ -513,261 +485,7 @@ wiki_examples = Div(
     ),
 )
 
-def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "StackExchange":
-        raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/stackexchange_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="StackExchange",
-        data_sources="StackExchange",
-        target=target,
-    )
-
-se_examples = Div(
-    Div(
-        get_se_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "PhilPapers":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/philpapers_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="PhilPapers",
-        data_sources="PhilPapers",
-        target=target,
-    )
-
-phil_examples = Div(
-    Div(
-        get_phil_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "Arxiv":
-        raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/arxiv_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Arxiv",
-        data_sources="Arxiv",
-        target=target,
-    )
-
-arx_examples = Div(
-    Div(
-        get_arx_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "S2ORC":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/s2orc_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="S2ORC",
-        data_sources="S2ORC",
-        target=target,
-    )
-
-s2o_examples = Div(
-    Div(
-        get_S2ORC_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "S2ORC":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/s2orc_abstract_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="S2ORC Abstract",
-        data_sources="S2ORC Abstract",
-        target=target,
-    )
-
-s2oa_examples = Div(
-    Div(
-        get_S2ORCA_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "Pubmed":
-        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/pubmed_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Pubmed",
-        data_sources="Pubmed",
-        target=target,
-    )
-
-pubmed_examples = Div(
-    Div(
-        get_pubmed_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "DM Math":
-        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/dm_maths_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="DM Math",
-        data_sources="DM Math",
-        target=target,
-    )
-
-dmm_examples = Div(
-    Div(
-        get_dmm_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "PG19":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/pg19_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="PG19",
-        data_sources="PG19",
-        target=target,
-    )
-
-pg19_examples = Div(
-    Div(
-        get_pg19_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-
-def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "Europarl":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/europarl_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Europarl",
-        data_sources="Europarl",
-        target=target,
-    )
-
-eu_examples = Div(
-    Div(
-        get_eu_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
 
 
 filtering_process = Div(
     Section(
@@ -803,10 +521,10 @@ filtering_process = Div(
                 Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
             ),
             table_div_arx,
-            Details(
-                Summary("ArXiv Filtering Examples"),
-                arx_examples,
-            ),
+            # Details(
+            #     Summary("ArXiv Filtering Examples"),
+            #     arx_examples,
+            # ),
         ),
     ),
     Section(
@@ -845,10 +563,10 @@ filtering_process = Div(
                 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
            ),
            table_div_s2o,
-            Details(
-                Summary("FreeLaw Filtering Examples -- need to update"),
-                freelaw_examples,
-            ),
+            # Details(
+            #     Summary("FreeLaw Filtering Examples -- need to update"),
+            #     freelaw_examples,
+            # ),
        ),
    ),
    Section(
@@ -881,10 +599,10 @@ filtering_process = Div(
                 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
             ),
             table_div_med,
-            Details(
-                Summary("PubMed Filtering Examples"),
-                pubmed_examples,
-            ),
+            # Details(
+            #     Summary("PubMed Filtering Examples"),
+            #     pubmed_examples,
+            # ),
         ),
     ),
     Section(
@@ -898,10 +616,10 @@ filtering_process = Div(
                 Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
             ),
             table_div_phil,
-            Details(
-                Summary("Phil Papers Filtering Examples"),
-                phil_examples,
-            ),
+            # Details(
+            #     Summary("Phil Papers Filtering Examples"),
+            #     phil_examples,
+            # ),
         ),
     ),
     Section(
@@ -913,10 +631,10 @@ filtering_process = Div(
             H4("Filtering"),
             P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
             table_div_up,
-            Details(
-                Summary("EuroParl Filtering Examples"),
-                eu_examples,
-            ),
+            # Details(
+            #     Summary("EuroParl Filtering Examples"),
+            #     eu_examples,
+            # ),
         ),
     ),
     Section(
@@ -977,10 +695,10 @@ filtering_process = Div(
                 Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
             ),
             table_div_freelaw,
-            Details(
-                Summary("FreeLaw Filtering Examples"),
-                freelaw_examples,
-            ),
+            # Details(
+            #     Summary("FreeLaw Filtering Examples"),
+            #     freelaw_examples,
+            # ),
 
         ),
     ),
@@ -1006,10 +724,10 @@ filtering_process = Div(
                 Li("Minimum Word Count Filter: 10"),
             ),
             table_div_se,
-            Details(
-                Summary("StackExchange Filtering Examples"),
-                se_examples,
-            ),
+            # Details(
+            #     Summary("StackExchange Filtering Examples"),
+            #     se_examples,
+            # ),
         ),
     ),
     Section(
@@ -1058,10 +776,10 @@ filtering_process = Div(
                 Li("None"),
             ),
             table_div_dmm,
-            Details(
-                Summary("DM Math Filtering Examples"),
-                dmm_examples,
-            ),
+            # Details(
+            #     Summary("DM Math Filtering Examples"),
+            #     dmm_examples,
+            # ),
         ),
     ),
     Section(
@@ -1079,10 +797,10 @@ filtering_process = Div(
                 Li("Unigram Log Probability"),
             ),
             table_div_pg19,
-            Details(
-                Summary("PG-19 Filtering Examples"),
-                pg19_examples,
-            ),
+            #Details(
+            #    Summary("PG-19 Filtering Examples"),
+            #    pg19_examples,
+            #),
         ),
     ),
 )
 
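Note on the deleted helpers: the nine removed get_*_data functions differ only in the data-source name and the sample-file paths. Each clamps doc_id to the ten bundled sample documents, loads a raw JSON from data/curated_samples/ (and, for some sources, a separate extract JSON; otherwise the raw samples are reused), and passes the selected pair to view_data. A single parameterized helper could have covered all of them. The sketch below is illustrative only and is not part of this commit: get_source_data, raw_path, and extract_path are hypothetical names, while view_data and gen_random_id are the existing functions referenced in the diff.

import json
from typing import Optional

def get_source_data(data_source: str, raw_path: str,
                    extract_path: Optional[str] = None,
                    doc_id: int = 3, target: str = "foo"):
    # Clamp doc_id to the 10 bundled sample documents, as the deleted helpers did.
    doc_id = max(0, min(int(doc_id), 9))

    with open(raw_path) as f:
        raw_sample_doc = json.load(f)
    if extract_path is not None:
        with open(extract_path) as f:
            extracted_sample_doc = json.load(f)
    else:
        # Sources without a separate extract file (PhilPapers, S2ORC, PG19,
        # Europarl) reused the raw samples for both views.
        extracted_sample_doc = raw_sample_doc

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_source,
        target=target,
    )

For example, get_source_data("Freelaw", "data/curated_samples/freelaw_raw.json", "data/curated_samples/freelaw_extract.json", target=gen_random_id()) would reproduce the deleted get_freelaw_data(target=gen_random_id()) call.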