mtasic85 committed on
Commit
0342add
·
1 Parent(s): fe25a83

prepare pretrain datasets

Browse files
Files changed (2) hide show
  1. scripts/pretrain_datasets.py +19 -14
  2. scripts/utils.py +1 -54
scripts/pretrain_datasets.py CHANGED
@@ -4,13 +4,13 @@ pretrain_datasets = [
4
  #
5
  # 3.17 GB, 2,226,907
6
  *[
7
- {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
8
- for i in range(0, 100, 5)
9
  ],
10
  # 1.64 GB, 1,001,000
11
  *[
12
- {'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
13
- for i in range(0, 100, 5)
14
  ],
15
 
16
  #
@@ -32,9 +32,14 @@ pretrain_datasets = [
32
  #
33
  # math
34
  #
35
- # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
 
 
 
 
 
36
  *[
37
- {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train_1M[{i}%:{i + 5}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
38
  for i in range(0, 100, 5)
39
  ],
40
 
@@ -43,12 +48,12 @@ pretrain_datasets = [
43
  #
44
  # 1.44 GB, 63,357
45
  *[
46
- {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['abstract']}
47
- for i in range(0, 100, 5)
48
  ],
49
  *[
50
- {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
51
- for i in range(0, 100, 5)
52
  ],
53
 
54
  #
@@ -56,8 +61,8 @@ pretrain_datasets = [
56
  #
57
  # 7.81 GB, ~2,804,025
58
  *[
59
- {'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 5}%]', 'format': '{input} {output}'}
60
- for i in range(0, 100, 5)
61
  ],
62
 
63
  #
@@ -65,8 +70,8 @@ pretrain_datasets = [
65
  #
66
  # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
67
  *[
68
- {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
69
- for i in range(0, 100, 5)
70
  ],
71
  {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
72
  {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
 
4
  #
5
  # 3.17 GB, 2,226,907
6
  *[
7
+ {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
8
+ for i in range(0, 100, 10)
9
  ],
10
  # 1.64 GB, 1,001,000
11
  *[
12
+ {'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
13
+ for i in range(0, 100, 10)
14
  ],
15
 
16
  #
 
32
  #
33
  # math
34
  #
35
+ # # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
36
+ # *[
37
+ # {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train_1M[{i}%:{i + 10}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
38
+ # for i in range(0, 100, 10)
39
+ # ],
40
+ # 12.6 GB, 14M rows
41
  *[
42
+ {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train[{i}%:{i + 5}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
43
  for i in range(0, 100, 5)
44
  ],
45
 
 
48
  #
49
  # 1.44 GB, 63,357
50
  *[
51
+ {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
52
+ for i in range(0, 100, 10)
53
  ],
54
  *[
55
+ {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['markdown']}
56
+ for i in range(0, 100, 10)
57
  ],
58
 
59
  #
 
61
  #
62
  # 7.81 GB, ~2,804,025
63
  *[
64
+ {'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 10}%]', 'format': '{input} {output}'}
65
+ for i in range(0, 100, 10)
66
  ],
67
 
68
  #
 
70
  #
71
  # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
72
  *[
73
+ {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
74
+ for i in range(0, 100, 10)
75
  ],
76
  {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
77
  {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
scripts/utils.py CHANGED
@@ -6,7 +6,7 @@ from datasets import load_dataset
6
  from litgpt.tokenizer import Tokenizer
7
  from transformers import AutoTokenizer
8
 
9
- # def _batch_text_iterator(path: str,
10
  def batch_text_iterator(path: str,
11
  name: Optional[str]=None,
12
  data_dir: Optional[str]=None,
@@ -41,7 +41,6 @@ def batch_text_iterator(path: str,
41
  gc.collect()
42
 
43
 
44
- # def _batch_chat_iterator(path: str,
45
  def batch_chat_iterator(path: str,
46
  name: Optional[str]=None,
47
  data_dir: Optional[str]=None,
@@ -84,66 +83,14 @@ def batch_chat_iterator(path: str,
84
  gc.collect()
85
 
86
 
87
- # def batch_text_iterator(dataset_config: Union[list, dict]) -> Iterator[str]:
88
- # assert isinstance(dataset_config, (dict, list)), dataset_config
89
- #
90
- # if isinstance(dataset_config, dict):
91
- # for text in _batch_text_iterator(**dataset_config):
92
- # yield text
93
- # elif isinstance(dataset_config, list):
94
- # for dc in dataset_config:
95
- # for text in _batch_text_iterator(**dc):
96
- # yield text
97
-
98
-
99
- # def batch_chat_iterator(dataset_config: Union[list, dict]) -> Iterator[list[dict[str, str]]]:
100
- # assert isinstance(dataset_config, (dict, list)), dataset_config
101
- #
102
- # if isinstance(dataset_config, dict):
103
- # for messages in _batch_chat_iterator(**dataset_config):
104
- # yield messages
105
- # elif isinstance(dataset_config, list):
106
- # for dc in dataset_config:
107
- # for messages in _batch_chat_iterator(**dc):
108
- # yield messages
109
-
110
-
111
- # def tokenize_text_fn(dataset_config: list, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
112
  def tokenize_text_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
113
  for text in batch_text_iterator(**dataset_config):
114
  text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
115
-
116
- # if min_len is None and max_len is None:
117
- # yield text_ids
118
-
119
- # if min_len is None:
120
- # min_len = 0
121
-
122
- # if max_len is None:
123
- # max_len = len(text_ids)
124
-
125
- # if min_len <= len(text_ids) <= max_len:
126
- # yield text_ids
127
-
128
  yield text_ids
129
 
130
 
131
- # def tokenize_chat_fn(dataset_config: list, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
132
  def tokenize_chat_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
133
  for messages in batch_chat_iterator(**dataset_config):
134
  text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
135
  text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
136
-
137
- # if min_len is None and max_len is None:
138
- # yield text_ids
139
-
140
- # if min_len is None:
141
- # min_len = 0
142
-
143
- # if max_len is None:
144
- # max_len = len(text_ids)
145
-
146
- # if min_len <= len(text_ids) <= max_len:
147
- # yield text_ids
148
-
149
  yield text_ids
 
6
  from litgpt.tokenizer import Tokenizer
7
  from transformers import AutoTokenizer
8
 
9
+
10
  def batch_text_iterator(path: str,
11
  name: Optional[str]=None,
12
  data_dir: Optional[str]=None,
 
41
  gc.collect()
42
 
43
 
 
44
  def batch_chat_iterator(path: str,
45
  name: Optional[str]=None,
46
  data_dir: Optional[str]=None,
 
83
  gc.collect()
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def tokenize_text_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
87
  for text in batch_text_iterator(**dataset_config):
88
  text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  yield text_ids
90
 
91
 
 
92
  def tokenize_chat_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
93
  for messages in batch_chat_iterator(**dataset_config):
94
  text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
95
  text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  yield text_ids