config

- config.json +40 -0
- scripts/prepare_contrain_dataset.py +18 -29
config.json
ADDED
@@ -0,0 +1,40 @@
+{
+  "_name_or_path": "tangledgroup/tangled-llama-j-128k-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": [
+    1,
+    4,
+    5
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 1000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.0.dev0",
+  "use_cache": true,
+  "vocab_size": 65536
+}
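Taken together, this is a compact grouped-query-attention Llama: 16 query heads with an explicit head_dim of 64 (so a 1024-wide attention projection even though hidden_size is 768) share 4 key/value heads, and the input and output embeddings are tied over a 65,536-token vocabulary. A minimal sketch, assuming a transformers build new enough to honor the explicit head_dim (the transformers_version above suggests 4.45), of instantiating an untrained model from this file; by the config's own arithmetic the printed count should land around 264M parameters:

from transformers import LlamaConfig, LlamaForCausalLM

# Build an untrained model straight from the config added in this commit.
config = LlamaConfig.from_pretrained("config.json")  # path to this file
model = LlamaForCausalLM(config)

# parameters() deduplicates the tied embedding/lm_head weight (65536 x 768).
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")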
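The rope_scaling block is what stretches the original 8,192-token training window toward the advertised max_position_embeddings of 131,072. Llama3-style scaling divides only the low-frequency rotary components by factor (here 32.0), leaves the high-frequency ones untouched, and smoothly blends in between; low_freq_factor and high_freq_factor set the two wavelength cutoffs. A hedged sketch of that rescaling with this config's values, mirroring the logic of transformers' llama3 RoPE utilities (an illustration of the knobs, not the exact library source):

import math

def llama3_scale(inv_freq, factor=32.0, low_freq_factor=1.0,
                 high_freq_factor=4.0, original_max=8192):
    # Wavelength cutoffs derived from the original context length.
    low_freq_wavelen = original_max / low_freq_factor    # 8192
    high_freq_wavelen = original_max / high_freq_factor  # 2048
    scaled = []
    for f in inv_freq:
        wavelen = 2 * math.pi / f
        if wavelen < high_freq_wavelen:    # high-frequency terms: keep as-is
            scaled.append(f)
        elif wavelen > low_freq_wavelen:   # low-frequency terms: stretch by factor
            scaled.append(f / factor)
        else:                              # smooth blend between the two regimes
            smooth = (original_max / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            scaled.append((1 - smooth) * f / factor + smooth * f)
    return scaled

# Base rotary frequencies for head_dim=64 and rope_theta=1e6, as in this config.
inv_freq = [1.0 / (1_000_000.0 ** (i / 64)) for i in range(0, 64, 2)]
print(llama3_scale(inv_freq)[:4])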
scripts/prepare_contrain_dataset.py
CHANGED
@@ -93,6 +93,24 @@ datasets_configs = [
     #
     # general instructs
     #
+
+    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
+    # meta-math/MetaMathQA 395,000
+    # openbmb/UltraInteract_sft 288,579
+    # HuggingFaceH4/ultrachat_200k 207,865
+    # microsoft/orca-math-word-problems-200k 200,035
+    # HuggingFaceH4/ultrafeedback_binarized 187,405
+    # theblackcat102/evol-codealpaca-v1 111,272
+    # Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+    # mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+    [
+        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+
     # arcee-ai/The-Tome - 4.58 GB, 1,752,473
     # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
     # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
@@ -174,36 +192,7 @@ datasets_configs = [
         for i in range(0, 100, 20)
     ],
 
-    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
-    # meta-math/MetaMathQA 395,000
-    # openbmb/UltraInteract_sft 288,579
-    # HuggingFaceH4/ultrachat_200k 207,865
-    # microsoft/orca-math-word-problems-200k 200,035
-    # HuggingFaceH4/ultrafeedback_binarized 187,405
-    # theblackcat102/evol-codealpaca-v1 111,272
-    # Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
-    # mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
-    [
-        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
-            {'role': roles_map[m['from']], 'content': m['value']}
-            for m in msgs
-        ]}
-        for i in range(0, 100, 20)
-    ],
 
-    #
-    # math
-    #
-    ## 6.07 GB, 11,402,286
-    # [
-    #     {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
-    #     for i in range(0, 100, 10)
-    # ],
-    ## 912 MB, 2,570,505
-    # [
-    #     {'path': 'ai2-adapt-dev/openmath-2-gsm8k', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
-    #     for i in range(0, 100, 10)
-    # ],
 
     #
     # tool/function calling
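Net effect of this change: the mlabonne/open-perfectblend entry moves up into the "general instructs" group, and the commented-out ai2-adapt-dev math datasets are dropped rather than carried along. For readers skimming the entry itself, here is a hedged sketch of how such a config might be consumed downstream; roles_map is defined elsewhere in the script, so the mapping below is an assumption based on the usual ShareGPT 'from'/'value' conventions:

from datasets import load_dataset

roles_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}  # assumed mapping

cfg = {
    'path': 'mlabonne/open-perfectblend',
    'split': 'train[0%:20%]',  # one of the five percentage slices from the comprehension
    'field': 'conversations',
    'transform': lambda msgs: [
        {'role': roles_map[m['from']], 'content': m['value']}
        for m in msgs
    ],
}

ds = load_dataset(cfg['path'], split=cfg['split'])
row = ds[0]
print(cfg['transform'](row[cfg['field']])[:2])  # first two normalized chat turns

The train[{i}%:{i + 20}%] slices are how a single dataset fans out into five shard-sized entries of datasets_configs via the surrounding list comprehension.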