mtasic85 committed
Commit: cd9e649
Parent(s): 60b8a72
Files changed (2):
  1. config.json +40 -0
  2. scripts/prepare_contrain_dataset.py +18 -29
config.json ADDED
@@ -0,0 +1,40 @@
+{
+  "_name_or_path": "tangledgroup/tangled-llama-j-128k-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": [
+    1,
+    4,
+    5
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 1000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.0.dev0",
+  "use_cache": true,
+  "vocab_size": 65536
+}
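The `rope_scaling` block (`rope_type: "llama3"`, `factor: 32.0`) rescales RoPE frequencies so a model with an 8,192-token pretraining window can address the declared `max_position_embeddings` of 131,072 (128k). A minimal sketch of loading this config follows; `LlamaConfig.from_json_file` is the real transformers API, while the parameter arithmetic is a back-of-the-envelope estimate that omits norm weights:

from transformers import LlamaConfig

# Assumes the JSON above is saved locally as config.json.
config = LlamaConfig.from_json_file('config.json')

# Back-of-the-envelope parameter count implied by the config
# (RMSNorm weights omitted; all projections here are bias-free,
# since attention_bias and mlp_bias are both false).
attn_dim = config.num_attention_heads * config.head_dim    # 16 * 64 = 1024
kv_dim = config.num_key_value_heads * config.head_dim      # 4 * 64 = 256 (GQA)

per_layer = (
    config.hidden_size * attn_dim * 2                      # q_proj + o_proj
    + config.hidden_size * kv_dim * 2                      # k_proj + v_proj
    + config.hidden_size * config.intermediate_size * 3    # gate/up/down MLP
)
# tie_word_embeddings=true, so the LM head reuses this matrix.
embeddings = config.vocab_size * config.hidden_size

total = embeddings + config.num_hidden_layers * per_layer
print(f'~{total / 1e6:.0f}M parameters')                   # ~264M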
scripts/prepare_contrain_dataset.py CHANGED
@@ -93,6 +93,24 @@ datasets_configs = [
     #
     # general instructs
     #
+
+    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
+    #   meta-math/MetaMathQA                                 395,000
+    #   openbmb/UltraInteract_sft                            288,579
+    #   HuggingFaceH4/ultrachat_200k                         207,865
+    #   microsoft/orca-math-word-problems-200k               200,035
+    #   HuggingFaceH4/ultrafeedback_binarized                187,405
+    #   theblackcat102/evol-codealpaca-v1                    111,272
+    #   Post-training-Data-Flywheel/AutoIF-instruct-61k       61,492
+    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt    57,362
+    [
+        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+
     # arcee-ai/The-Tome - 4.58 GB, 1,752,473
     # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
     # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
@@ -174,36 +192,7 @@ datasets_configs = [
         for i in range(0, 100, 20)
     ],
 
-    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
-    #   meta-math/MetaMathQA                                 395,000
-    #   openbmb/UltraInteract_sft                            288,579
-    #   HuggingFaceH4/ultrachat_200k                         207,865
-    #   microsoft/orca-math-word-problems-200k               200,035
-    #   HuggingFaceH4/ultrafeedback_binarized                187,405
-    #   theblackcat102/evol-codealpaca-v1                    111,272
-    #   Post-training-Data-Flywheel/AutoIF-instruct-61k       61,492
-    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt    57,362
-    [
-        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
-            {'role': roles_map[m['from']], 'content': m['value']}
-            for m in msgs
-        ]}
-        for i in range(0, 100, 20)
-    ],
 
-    #
-    # math
-    #
-    ## 6.07 GB, 11,402,286
-    # [
-    #     {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
-    #     for i in range(0, 100, 10)
-    # ],
-    ## 912 MB, 2,570,505
-    # [
-    #     {'path': 'ai2-adapt-dev/openmath-2-gsm8k', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
-    #     for i in range(0, 100, 10)
-    # ],
 
     #
     # tool/function calling
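Each entry's `split` f-string shards the dataset into five 20% slices (`train[0%:20%]` through `train[80%:100%]`), and the `transform` lambda remaps ShareGPT-style `{'from', 'value'}` messages into `{'role', 'content'}` chat records. The consuming loop is not part of this diff, so the sketch below is a hypothetical reading of how one entry could be resolved; `load_dataset` and the percent-slice split syntax are real Hugging Face `datasets` APIs, while `iter_config` and the `roles_map` values are illustrative stand-ins:

from datasets import load_dataset

# Assumed ShareGPT-role mapping; the script defines its own roles_map.
roles_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}

def iter_config(config: dict):
    # Percent slices like 'train[0%:20%]' are standard datasets split syntax.
    dataset = load_dataset(config['path'], split=config['split'])
    transform = config.get('transform')

    for row in dataset:
        messages = row[config['field']]

        if transform is not None:
            # Remap {'from': ..., 'value': ...} to {'role': ..., 'content': ...}.
            messages = transform(messages)

        yield messages

# First 20% shard of open-perfectblend, remapped on the fly:
config = {
    'path': 'mlabonne/open-perfectblend',
    'split': 'train[0%:20%]',
    'field': 'conversations',
    'transform': lambda msgs: [
        {'role': roles_map[m['from']], 'content': m['value']}
        for m in msgs
    ],
}

for messages in iter_config(config):
    print(messages[0])
    break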