mtasic85 committed
Commit 99b145e · 1 Parent(s): 6f41c4a
config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "eos_token_id": 1,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 512,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 4,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.45.0.dev0",
+   "use_cache": true,
+   "vocab_size": 131072
+ }
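A minimal sketch (not part of this commit) for sanity-checking this config: it instantiates an untrained model from the file and verifies the head geometry; assumes transformers >= 4.45 and a local directory containing this config.json.

    from transformers import AutoModelForCausalLM, LlamaConfig

    config = LlamaConfig.from_pretrained('.')  # directory holding this config.json
    assert config.hidden_size // config.num_attention_heads == config.head_dim  # 512 / 4 == 128
    model = AutoModelForCausalLM.from_config(config)  # randomly initialized weights
    print(sum(p.numel() for p in model.parameters()))  # rough parameter count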
misc/logo.jpg ADDED
scripts/backup/base_instruct_datasets.py ADDED
@@ -0,0 +1,124 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ core_instruct_datasets = [
+     #
+     # general instructs
+     #
+     # 1.48 GB, 1,420,909
+     # mlabonne/open-perfectblend
+     #   meta-math/MetaMathQA 395,000
+     #   openbmb/UltraInteract_sft 288,579
+     #   HuggingFaceH4/ultrachat_200k 207,865
+     #   microsoft/orca-math-word-problems-200k 200,035
+     #   HuggingFaceH4/ultrafeedback_binarized 187,405
+     #   theblackcat102/evol-codealpaca-v1 111,272
+     #   Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+     #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+     *[
+         {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 1.41 GB, 939,343
+     # allenai/tulu-3-sft-mixture
+     #   CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
+     #   FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
+     #   No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al., 2023)
+     #   OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
+     #   Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
+     #   Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
+     #   Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
+     #   Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
+     #   Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
+     #   NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al., 2024)
+     #   Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
+     #   Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
+     #   Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
+     #   Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
+     #   WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
+     #   TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
+     #   SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
+     #   Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
+     *[
+         {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # multilingual instructs
+     #
+     # 2.48 GB, 5,808,694
+     # rombodawg/Everything_Instruct_Multilingual
+     #   Science:
+     #     antiven0m/physical-reasoning-dpoScience
+     #     LawalAfeez/science-dataset
+     #   Social media:
+     #     Kyle1668/AG-Tweets
+     #     euclaise/reddit-instruct-curated
+     #   General Knowledge:
+     #     NousResearch/CharacterCodex_Characters
+     #     jstet/quotes-500k_Famous_Quotes
+     #     FronkonGames/steam-games-dataset_Video_Games
+     #     totuta_youtube_subs_howto100M_HowTo
+     #   Multi-lingual:
+     #     Amani27/massive_translation_dataset
+     #     udmurtNLP/udmurt-russian-english-labse
+     #     grosenthal/latin_english
+     #     msarmi9/korean-english-multitarget-ted-talks-task
+     #     HaiderSultanArc/MT-Urdu-English_Translate
+     #     Garsa3112/ChineseEnglishTranslationDataset
+     #   Cooking:
+     #     andrewsiah/se_cooking_preference_sft
+     #     Hieu-Phamkaggle/food_recipes
+     #   Writing:
+     #     shahules786/PoetryFoundationData
+     #     euclaise/writingprompts
+     #     qwedsacf/ivypanda-essaysEssay
+     #   Medicine:
+     #     keivalya/MedQuad-MedicalQnADataset
+     #     nuvocare/MSD
+     #   History:
+     #     ambrosfitz10k/history_data_v4
+     #   Law:
+     #     dzunggg/legal-qa-v1
+     #   Role-Play:
+     #     roleplay4/fun_CoupleRP
+     #     Undi95andrijdavid/roleplay-conversation-sharegpt
+     #   News:
+     #     RealTimeData/bbc_news_alltime
+     #   Coding: (rombodawg/code_bagel)
+     #     layoric/tiny-codes-alpaca
+     #     glaiveai/glaive-code-assistant-v3
+     #     ajibawa-2023/Code-290k-ShareGPT
+     #     chargoddard/commitpack-ft-instruct-rated
+     #     iamtarun/code_instructions_120k_alpaca
+     #     ise-uiuc/Magicoder-Evol-Instruct-110K
+     #     cognitivecomputations/dolphin-coder
+     #     nickrosh/Evol-Instruct-Code-80k-v1
+     #     coseal/CodeUltraFeedback_binarized
+     #     CyberNative/Code_Vulnerability_Security_DPO
+     #   Math: (rombodawg/code_bagel)
+     #     TIGER-Lab/MathInstruct
+     #   Function calling: (rombodawg/code_bagel)
+     #     glaiveai/glaive-function-calling-v2
+     #   General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
+     #     teknium/OpenHermes-2.5
+     *[
+         {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': r['instruction']},
+             {'role': 'user', 'content': r['input']},
+             {'role': 'assistant', 'content': r['output']},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+ ]
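The `train[{i}%:{i + 10}%]` split strings shard each corpus into ten percent-range slices, so each slice can be downloaded, transformed, and tokenized independently. A minimal sketch (not part of this commit) of how one entry resolves with the `datasets` library:

    from datasets import load_dataset

    entry = core_instruct_datasets[0]  # first 10% slice of mlabonne/open-perfectblend
    ds = load_dataset(entry['path'], split=entry['split'])
    messages = entry['transform'](ds[0][entry['field']])  # normalize 'from'/'value' to 'role'/'content'
    print(messages[0])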
scripts/backup/base_reason_datasets.py ADDED
@@ -0,0 +1,79 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+ R1_SYSTEM_PROMPT = '''\
+ You are an AI assistant.
+
+ Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
+
+ Formatting Requirements:
+ - Always structure your replies using: <think>{reasoning}</think>{answer}
+ - The <think></think> block should contain at least six reasoning steps when applicable.
+ - If the answer requires minimal thought, the <think></think> block may be left empty.
+ - The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
+ - If you notice that you have engaged in circular reasoning or repetition, immediately terminate {reasoning} with a </think> and proceed to the {answer}.
+
+ Response Guidelines:
+ - Detailed and Structured: Use rich Markdown formatting for clarity and readability.
+ - Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
+ - Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
+ - Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
+ - Maintain a professional, intelligent, and analytical tone in all interactions.'''
+
+ core_reason_datasets = [
+     #
+     # math reason
+     #
+     # 8.43 GB, 450,258
+     *[
+         {'kind': 'instruct', 'path': 'open-r1/OpenR1-Math-220k', 'data_dir': 'data', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # general reason
+     #
+     # 3.55 GB, 227,914
+     *[
+         {'kind': 'instruct', 'path': 'open-thoughts/OpenThoughts-114k', 'data_dir': 'data', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': r['system']}
+         ] + [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in r['conversations']
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 3.98 GB, 814,334
+     # 300k
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-deepseek.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': R1_SYSTEM_PROMPT},
+             *r['messages'],
+             {'role': 'assistant', 'content': '<think>\n' + (r.get('reasoning') or '') + '\n</think>\n' + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 300k
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-flash.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': R1_SYSTEM_PROMPT},
+             *r['messages'],
+             {'role': 'assistant', 'content': '<think>\n' + (r.get('reasoning') or '') + '\n</think>\n' + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 21.1 MB, 1,000
+     {'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train', 'transform': lambda r: [
+         {'role': 'user', 'content': r.get('question') or ''},
+         {'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
+     ]},
+ ]
scripts/backup/cpt_base_datasets.py ADDED
@@ -0,0 +1,58 @@
+ cpt_base_datasets = [
+     #
+     # stem
+     #
+     # 1.44 GB, 63,357
+     {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': 'train', 'format': lambda n: n['abstract']},
+     {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': 'train', 'format': lambda n: n['markdown']},
+
+     #
+     # code
+     #
+     # 1.62 GB, 1,632,309
+     # Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
+     {'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': 'train', 'format': '{prompt} {response}'},
+
+     #
+     # misc
+     #
+     # 472 KB, 5,034
+     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
+
+     #
+     # multilingual
+     #
+     # 742 MB, 321,697
+     *[
+         {'kind': 'base', 'path': 'data-silence/sumnews', 'split': split, 'format': lambda n: n[field]}
+         for split in ['train', 'test']
+         for field in ['title', 'resume', 'news']
+     ],
+     # 193 MB, 1,141,967
+     *[
+         {'kind': 'base', 'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
+         for name in [
+             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+             'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+             'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+             'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+             'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+             'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+             'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+             'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+             'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+             'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+             'zh-Hans', 'zh-Hant', 'zu',
+         ]
+     ],
+
+     #
+     # general knowledge
+     #
+     # 3.18 GB, 1,010,500 - uncompressed 6 GB
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'train', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+ ]
scripts/backup/cpt_base_model.py ADDED
@@ -0,0 +1,101 @@
+ from unsloth import FastLanguageModel
+ import torch
+ from transformers import AutoTokenizer
+
+ max_seq_length = 4096
+ dtype = torch.bfloat16
+ load_in_4bit = True
+ model_name = '../out/pretrain-base'
+ output_dir = '../out/cpt-base'
+
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name=model_name,
+     max_seq_length=max_seq_length,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+ )
+
+ print('Ignoring the tokenizer loaded by FastLanguageModel.from_pretrained; using AutoTokenizer.from_pretrained instead')
+ tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
+
+ print(f'{model=}')
+ print(f'{tokenizer=}')
+
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r=64,  # 128, # choose any number > 0; suggested: 8, 16, 32, 64, 128
+     target_modules=[
+         'q_proj', 'k_proj', 'v_proj', 'o_proj',
+         'gate_proj', 'up_proj', 'down_proj',
+         'embed_tokens', 'lm_head',  # add embed_tokens and lm_head for continual pretraining
+     ],
+     lora_alpha=16,
+     lora_dropout=0,  # supports any value, but 0 is optimized
+     bias='none',  # supports any value, but 'none' is optimized
+     # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes
+     use_gradient_checkpointing='unsloth',  # True or 'unsloth' for very long context
+     random_state=23,
+     use_rslora=True,  # rank-stabilized LoRA
+     loftq_config=None,  # and LoftQ
+ )
+
+ print(f'{model=}')
+
+ from datasets import concatenate_datasets
+ from cpt_base_datasets import cpt_base_datasets
+ from cpt_instruct_datasets import cpt_instruct_datasets
+ from unsloth_utils import load_text_dataset, load_chat_dataset
+
+ core_datasets = []
+
+ for dataset_config in cpt_base_datasets:
+     dataset = load_text_dataset(tokenizer, **dataset_config)
+     print(f'{dataset=}')
+     core_datasets.append(dataset)
+
+ # for dataset_config in cpt_instruct_datasets:
+ #     dataset = load_chat_dataset(tokenizer, **dataset_config)
+ #     print(f'{dataset=}')
+ #     core_datasets.append(dataset)
+
+ final_dataset = concatenate_datasets(core_datasets)
+ print(f'{final_dataset=}')
+
+
+ from trl import SFTTrainer
+ from transformers import TrainingArguments
+ from unsloth import is_bfloat16_supported
+ from unsloth import UnslothTrainer, UnslothTrainingArguments
+
+
+ trainer = UnslothTrainer(
+     model=model,
+     tokenizer=tokenizer,
+     train_dataset=final_dataset,
+     dataset_text_field='text',
+     max_seq_length=max_seq_length,
+     dataset_num_proc=32,
+
+     args=UnslothTrainingArguments(
+         per_device_train_batch_size=8,
+         gradient_accumulation_steps=8,
+
+         warmup_ratio=0.1,
+         num_train_epochs=1,
+
+         learning_rate=5e-5,
+         embedding_learning_rate=5e-6,
+
+         fp16=not is_bfloat16_supported(),
+         bf16=is_bfloat16_supported(),
+         logging_steps=1,
+         optim='adamw_8bit',
+         weight_decay=0.01,
+         lr_scheduler_type='cosine',
+         seed=23,
+         output_dir=output_dir,
+         report_to='wandb',
+     ),
+ )
+
+ trainer_stats = trainer.train()
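A quick post-training smoke test (a sketch, not part of this commit; it assumes the checkpoint written to `output_dir` is loadable the same way as the base model):

    from unsloth import FastLanguageModel

    model, _ = FastLanguageModel.from_pretrained(model_name=output_dir, max_seq_length=max_seq_length)
    FastLanguageModel.for_inference(model)  # switch to Unsloth's optimized generation path
    inputs = tokenizer('The capital of France is', return_tensors='pt').to(model.device)
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))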
scripts/backup/cpt_instruct_datasets.py ADDED
@@ -0,0 +1,119 @@
+ import json
+
+
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ cpt_instruct_datasets = [
+     #
+     # general instructs
+     #
+     # 1.48 GB, 1,420,909
+     # mlabonne/open-perfectblend
+     #   meta-math/MetaMathQA 395,000
+     #   openbmb/UltraInteract_sft 288,579
+     #   HuggingFaceH4/ultrachat_200k 207,865
+     #   microsoft/orca-math-word-problems-200k 200,035
+     #   HuggingFaceH4/ultrafeedback_binarized 187,405
+     #   theblackcat102/evol-codealpaca-v1 111,272
+     #   Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+     #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+     {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+
+     # 1.41 GB, 939,343
+     # allenai/tulu-3-sft-mixture
+     #   CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
+     #   FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
+     #   No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al., 2023)
+     #   OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
+     #   Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
+     #   Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
+     #   Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
+     #   Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
+     #   Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
+     #   NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al., 2024)
+     #   Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
+     #   Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
+     #   Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
+     #   Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
+     #   WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
+     #   TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
+     #   SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
+     #   Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
+     {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': 'train', 'field': 'messages'},
+
+     #
+     # multilingual instructs
+     #
+     # 2.48 GB, 5,808,694
+     # rombodawg/Everything_Instruct_Multilingual
+     #   Science:
+     #     antiven0m/physical-reasoning-dpoScience
+     #     LawalAfeez/science-dataset
+     #   Social media:
+     #     Kyle1668/AG-Tweets
+     #     euclaise/reddit-instruct-curated
+     #   General Knowledge:
+     #     NousResearch/CharacterCodex_Characters
+     #     jstet/quotes-500k_Famous_Quotes
+     #     FronkonGames/steam-games-dataset_Video_Games
+     #     totuta_youtube_subs_howto100M_HowTo
+     #   Multi-lingual:
+     #     Amani27/massive_translation_dataset
+     #     udmurtNLP/udmurt-russian-english-labse
+     #     grosenthal/latin_english
+     #     msarmi9/korean-english-multitarget-ted-talks-task
+     #     HaiderSultanArc/MT-Urdu-English_Translate
+     #     Garsa3112/ChineseEnglishTranslationDataset
+     #   Cooking:
+     #     andrewsiah/se_cooking_preference_sft
+     #     Hieu-Phamkaggle/food_recipes
+     #   Writing:
+     #     shahules786/PoetryFoundationData
+     #     euclaise/writingprompts
+     #     qwedsacf/ivypanda-essaysEssay
+     #   Medicine:
+     #     keivalya/MedQuad-MedicalQnADataset
+     #     nuvocare/MSD
+     #   History:
+     #     ambrosfitz10k/history_data_v4
+     #   Law:
+     #     dzunggg/legal-qa-v1
+     #   Role-Play:
+     #     roleplay4/fun_CoupleRP
+     #     Undi95andrijdavid/roleplay-conversation-sharegpt
+     #   News:
+     #     RealTimeData/bbc_news_alltime
+     #   Coding: (rombodawg/code_bagel)
+     #     layoric/tiny-codes-alpaca
+     #     glaiveai/glaive-code-assistant-v3
+     #     ajibawa-2023/Code-290k-ShareGPT
+     #     chargoddard/commitpack-ft-instruct-rated
+     #     iamtarun/code_instructions_120k_alpaca
+     #     ise-uiuc/Magicoder-Evol-Instruct-110K
+     #     cognitivecomputations/dolphin-coder
+     #     nickrosh/Evol-Instruct-Code-80k-v1
+     #     coseal/CodeUltraFeedback_binarized
+     #     CyberNative/Code_Vulnerability_Security_DPO
+     #   Math: (rombodawg/code_bagel)
+     #     TIGER-Lab/MathInstruct
+     #   Function calling: (rombodawg/code_bagel)
+     #     glaiveai/glaive-function-calling-v2
+     #   General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
+     #     teknium/OpenHermes-2.5
+     {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': 'train', 'transform': lambda r: [
+         {'role': 'system', 'content': r['instruction']},
+         {'role': 'user', 'content': r['input']},
+         {'role': 'assistant', 'content': r['output']},
+     ]},
+ ]
scripts/backup/merge-core-into-base.yaml ADDED
@@ -0,0 +1,100 @@
+ slices:
+   # 1
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 2
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 3
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 4
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 5
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 6
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 7
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 8
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 9
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 10
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 11
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 12
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 13
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 14
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 15
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 16
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 17
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 18
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 19
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 20
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 21
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 22
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 23
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 24
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+
+ merge_method: passthrough
+ dtype: bfloat16
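The passthrough merge above repeats the first transformer layer (`layer_range: [0, 1]`) of the core checkpoint 24 times, i.e. depth up-scaling by layer duplication. Assuming a standard mergekit installation, the config would be applied with something like the following (output path illustrative):

    mergekit-yaml merge-core-into-base.yaml ../out/pretrain-base-merged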
scripts/backup/prepare_pretrain_base_datasets.py ADDED
@@ -0,0 +1,59 @@
+ from functools import partial
+
+ from litgpt.tokenizer import Tokenizer
+ from litdata import optimize, TokensLoader, StreamingDataset
+ from transformers import AutoTokenizer
+
+ from utils import tokenize_fn
+ from pretrain_base_datasets import pretrain_base_datasets
+ from pretrain_instruct_datasets import pretrain_instruct_datasets
+ from pretrain_reflection_datasets import pretrain_reflection_datasets
+ from pretrain_reasoning_datasets import pretrain_reasoning_datasets
+
+
+ #
+ # optimize datasets
+ #
+ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
+     chunk_size = block_size * subchunk_size
+     output_dir = f'../pretrain-base-data-{i}-{block_size}-{subchunk_size}'
+
+     outputs = optimize(
+         fn=partial(
+             tokenize_fn,
+             hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
+             tokenizer=Tokenizer('..'),
+         ),
+         inputs=(
+             pretrain_base_datasets +
+             pretrain_instruct_datasets +
+             pretrain_reflection_datasets +
+             pretrain_reasoning_datasets
+         ),
+         output_dir=output_dir,
+         chunk_size=chunk_size,  # number of tokens to store per chunk; roughly 64 MB of tokens
+         num_workers=32,
+         reorder_files=False,
+         ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
+         ## LitData skips storing metadata for each sample, e.g. all tokens are concatenated into one large tensor.
+         # item_loader=TokensLoader(block_size=block_size),
+     )
+
+ #
+ # total number of chunks in datasets
+ #
+ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
+     chunk_size = block_size * subchunk_size
+     input_dir = f'../pretrain-base-data-{i}-{block_size}-{subchunk_size}'
+
+     dataset = StreamingDataset(
+         input_dir=input_dir,
+         item_loader=TokensLoader(block_size=block_size),
+     )
+
+     print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
+
+     # total_tokens = sum(len(data) for data in dataset)
+     # print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
+     total_tokens = len(dataset) * block_size
+     print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
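At training time the optimized directory is streamed back with the same `TokensLoader`; a minimal sketch (not part of this commit, loader parameters illustrative):

    from litdata import StreamingDataLoader, StreamingDataset, TokensLoader

    dataset = StreamingDataset(
        input_dir='../pretrain-base-data-0-4097-4000',
        item_loader=TokensLoader(block_size=4097),  # each item is a contiguous run of 4097 token ids
    )
    loader = StreamingDataLoader(dataset, batch_size=2, num_workers=4)
    batch = next(iter(loader))  # shape (2, 4097); inputs are batch[:, :-1], targets batch[:, 1:]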
scripts/backup/pretrain_base_datasets.py ADDED
@@ -0,0 +1,107 @@
+ pretrain_base_datasets = [
+     #
+     # multilingual
+     #
+     # 3.17 GB, 2,226,907
+     *[
+         {'kind': 'base', 'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 10)
+     ],
+     # 1.64 GB, 1,001,000
+     *[
+         {'kind': 'base', 'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 10)
+     ],
+     # 3.8 GB, 19,454,996
+     *[
+         {'kind': 'base', 'path': 'sentence-transformers/parallel-sentences-wikimatrix', 'data_dir': 'all', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['non_english']}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # general knowledge
+     #
+     # 65.1 MB, 7,819
+     {'kind': 'base', 'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
+     # 135 MB, 1,795
+     {'kind': 'base', 'path': 'open-phi/textbooks', 'format': lambda n: n['markdown']},
+     # 631 MB, 111,048
+     {'kind': 'base', 'path': 'open-phi/programming_books_llama', 'format': lambda n: n['markdown']},
+
+     #
+     # misc
+     #
+     # 472 KB, 5,034
+     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
+
+     #
+     # math
+     #
+     # 12.6 GB, 14M rows
+     *[
+         {'kind': 'base', 'path': 'nvidia/OpenMathInstruct-2', 'split': f'train[{i}%:{i + 10}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # stem
+     #
+     # 1.44 GB, 63,357
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
+         for i in range(0, 100, 10)
+     ],
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['markdown']}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # code
+     #
+     # 7.81 GB, ~2,804,025
+     *[
+         {'kind': 'base', 'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 10}%]', 'format': '{input} {output}'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # multilingual
+     #
+     # 742 MB, 321,697
+     *[
+         {'kind': 'base', 'path': 'data-silence/sumnews', 'split': split, 'format': lambda n: n[field]}
+         for split in ['train', 'test']
+         for field in ['title', 'resume', 'news']
+     ],
+     # 193 MB, 1,141,967
+     *[
+         {'kind': 'base', 'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
+         for name in [
+             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+             'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+             'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+             'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+             'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+             'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+             'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+             'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+             'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+             'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+             'zh-Hans', 'zh-Hant', 'zu',
+         ]
+     ],
+
+     #
+     # general knowledge
+     #
+     # 3.18 GB, 1,010,500 - uncompressed 6 GB
+     *[
+         {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 10)
+     ],
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+ ]
scripts/backup/pretrain_instruct_datasets.py ADDED
@@ -0,0 +1,198 @@
+ import json
+
+
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ pretrain_instruct_datasets = [
+     #
+     # general instructs
+     #
+     # 138 MB, 205,568
+     {'kind': 'instruct', 'path': 'CohereForAI/aya_dataset', 'transform': lambda r: [
+         {'role': 'user', 'content': r['inputs']},
+         {'role': 'assistant', 'content': r['targets']},
+     ]},
+
+     # ~3 GB, 4,976,850
+     *[
+         {'kind': 'instruct', 'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'transform': lambda r: [
+             {'role': 'system', 'content': r['instruction']},
+             {'role': 'user', 'content': r['input']},
+             {'role': 'assistant', 'content': r['output']},
+         ]}
+         for name in [
+             'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+             'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+         ]
+     ],
+
+     # 1.48 GB, 1,420,909
+     # mlabonne/open-perfectblend
+     #   meta-math/MetaMathQA 395,000
+     #   openbmb/UltraInteract_sft 288,579
+     #   HuggingFaceH4/ultrachat_200k 207,865
+     #   microsoft/orca-math-word-problems-200k 200,035
+     #   HuggingFaceH4/ultrafeedback_binarized 187,405
+     #   theblackcat102/evol-codealpaca-v1 111,272
+     #   Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+     #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+     *[
+         {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 4.58 GB, 1,752,473
+     # arcee-ai/The-Tome
+     #   - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
+     #   - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
+     #   - jondurbin/airoboros-3.2
+     #   - gardner/glaive-function-calling-v2-sharegpt
+     #   - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
+     #   - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
+     #   - cognitivecomputations/ultrainteract_trajectories_sharegpt
+     #   - cognitivecomputations/SystemChat-2.0
+     #   - arcee-ai/qwen2-72b-magpie-en
+     *[
+         {'kind': 'instruct', 'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 2.48 GB, 5,808,694
+     # rombodawg/Everything_Instruct_Multilingual
+     #   Science:
+     #     antiven0m/physical-reasoning-dpoScience
+     #     LawalAfeez/science-dataset
+     #   Social media:
+     #     Kyle1668/AG-Tweets
+     #     euclaise/reddit-instruct-curated
+     #   General Knowledge:
+     #     NousResearch/CharacterCodex_Characters
+     #     jstet/quotes-500k_Famous_Quotes
+     #     FronkonGames/steam-games-dataset_Video_Games
+     #     totuta_youtube_subs_howto100M_HowTo
+     #   Multi-lingual:
+     #     Amani27/massive_translation_dataset
+     #     udmurtNLP/udmurt-russian-english-labse
+     #     grosenthal/latin_english
+     #     msarmi9/korean-english-multitarget-ted-talks-task
+     #     HaiderSultanArc/MT-Urdu-English_Translate
+     #     Garsa3112/ChineseEnglishTranslationDataset
+     #   Cooking:
+     #     andrewsiah/se_cooking_preference_sft
+     #     Hieu-Phamkaggle/food_recipes
+     #   Writing:
+     #     shahules786/PoetryFoundationData
+     #     euclaise/writingprompts
+     #     qwedsacf/ivypanda-essaysEssay
+     #   Medicine:
+     #     keivalya/MedQuad-MedicalQnADataset
+     #     nuvocare/MSD
+     #   History:
+     #     ambrosfitz10k/history_data_v4
+     #   Law:
+     #     dzunggg/legal-qa-v1
+     #   Role-Play:
+     #     roleplay4/fun_CoupleRP
+     #     Undi95andrijdavid/roleplay-conversation-sharegpt
+     #   News:
+     #     RealTimeData/bbc_news_alltime
+     #   Coding: (rombodawg/code_bagel)
+     #     layoric/tiny-codes-alpaca
+     #     glaiveai/glaive-code-assistant-v3
+     #     ajibawa-2023/Code-290k-ShareGPT
+     #     chargoddard/commitpack-ft-instruct-rated
+     #     iamtarun/code_instructions_120k_alpaca
+     #     ise-uiuc/Magicoder-Evol-Instruct-110K
+     #     cognitivecomputations/dolphin-coder
+     #     nickrosh/Evol-Instruct-Code-80k-v1
+     #     coseal/CodeUltraFeedback_binarized
+     #     CyberNative/Code_Vulnerability_Security_DPO
+     #   Math: (rombodawg/code_bagel)
+     #     TIGER-Lab/MathInstruct
+     #   Function calling: (rombodawg/code_bagel)
+     #     glaiveai/glaive-function-calling-v2
+     #   General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
+     #     teknium/OpenHermes-2.5
+     *[
+         {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': r['instruction']},
+             {'role': 'user', 'content': r['input']},
+             {'role': 'assistant', 'content': r['output']},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 1.41 GB, 939,343
+     # allenai/tulu-3-sft-mixture
+     #   CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
+     #   FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
+     #   No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al., 2023)
+     #   OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
+     #   Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
+     #   Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
+     #   Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
+     #   Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
+     #   Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
+     #   NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al., 2024)
+     #   Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
+     #   Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
+     #   Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
+     #   Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
+     #   WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
+     #   TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
+     #   SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
+     #   Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
+     *[
+         {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # tool/function calling
+     #
+     # 65.7 MB, 11,578
+     {'kind': 'instruct', 'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+
+     #
+     # agent
+     #
+     # 1.51 GB, 485,874
+     *[
+         {'kind': 'instruct', 'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 2.21 GB, 1,046,410
+     *[
+         {'kind': 'instruct', 'path': 'microsoft/orca-agentinstruct-1M-v1', 'split': split, 'field': 'messages', 'transform': lambda msgs: json.loads(msgs)}
+         for split in [
+             'creative_content', 'text_modification', 'struct2text_flow', 'rc', 'rag',
+             'text_extraction', 'mcq', 'follow_up', 'analytical_reasoning', 'fermi', 'fs_cot_flow',
+             'code_', 'brain_teaser', 'text_classification', 'open_domain_qa',
+         ]
+     ],
+
+     #
+     # general instructs
+     #
+     # 1.52 GB, 214k (3.98 GB, 814,334)
+     {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-nonreasoning.jsonl', 'split': 'train', 'field': 'messages'},
+     # 4.15 GB, 2,197,730
+     {'kind': 'instruct', 'path': 'HuggingFaceTB/smoltalk', 'name': 'all', 'field': 'messages'},
+ ]
scripts/backup/pretrain_reasoning_datasets.py ADDED
@@ -0,0 +1,98 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ pretrain_reasoning_datasets = [
+     #
+     # basic reasoning
+     #
+     # 10.8 MB, 15,770
+     {'kind': 'instruct', 'path': 'AtlasUnified/Atlas-Reasoning', 'data_files': 'reasoning.csv', 'transform': lambda r: [
+         {'role': 'user', 'content': r['Prompt']},
+         {'role': 'assistant', 'content': r['Step-by-step reasoning'] + '\n' + r['Solution']},
+     ]},
+     # 1.23 GB, 859,594
+     *[
+         {'kind': 'instruct', 'path': 'AI-MO/NuminaMath-CoT', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+     # 148 MB, 72,540
+     *[
+         {'kind': 'instruct', 'path': 'AI-MO/NuminaMath-TIR', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # math reasoning
+     #
+     # 1.79 MB, 3,963
+     {'kind': 'instruct', 'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'transform': lambda r: [
+         {'role': 'user', 'content': r['informal_statement']},
+         {'role': 'assistant', 'content': r['informal_proof'] + '\n' + r['formal_proof']},
+     ]},
+     # 307 MB, 19,944
+     {'kind': 'instruct', 'path': 'KingNish/reasoning-base-20k', 'transform': lambda r: [
+         {'role': 'user', 'content': r['user']},
+         {'role': 'assistant', 'content': r['reasoning'] + '\n' + r['assistant']},
+     ]},
+     # 9.45 MB, 10,000
+     {'kind': 'instruct', 'path': 'Aarushhh/math-reasoning-10k', 'transform': lambda r: [
+         {'role': 'user', 'content': r['problem']},
+         {'role': 'assistant', 'content': r['plan'] + '\n' + r['solution']},
+     ]},
+
+     #
+     # cot reasoning
+     #
+     # 11.7 GB, 1,850,809
+     *[
+         {'kind': 'instruct', 'path': 'ServiceNow-AI/R1-Distill-SFT', 'data_dir': 'v0', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'user', 'content': r['problem']},
+             {'role': 'assistant', 'content': r['reannotated_assistant_content']},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     *[
+         {'kind': 'instruct', 'path': 'ServiceNow-AI/R1-Distill-SFT', 'data_dir': 'v1', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: r['reannotated_messages']}
+         for i in range(0, 100, 10)
+     ],
+     # 3.85 GB, 300k (3.98 GB, 814,334)
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-deepseek.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             *r['messages'],
+             # {'role': 'assistant', 'content': (('<think>\n' + r['reasoning'] + '\n</think>\n') if r.get('reasoning') else '') + r['answer']},
+             {'role': 'assistant', 'content': (r.get('reasoning') or '') + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 3.49 GB, 300k (3.98 GB, 814,334)
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-flash.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             *r['messages'],
+             # {'role': 'assistant', 'content': (('<think>\n' + r['reasoning'] + '\n</think>\n') if r.get('reasoning') else '') + r['answer']},
+             {'role': 'assistant', 'content': (r.get('reasoning') or '') + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 1.08 GB, 113,957
+     {'kind': 'instruct', 'path': 'open-thoughts/OpenThoughts-114k', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+     # 384 MB, 77,685
+     {'kind': 'instruct', 'path': 'O1-OPEN/OpenO1-SFT', 'split': 'train', 'transform': lambda r: [
+         {'role': 'user', 'content': r['instruction']},
+         {'role': 'assistant', 'content': r['output']},
+     ]},
+     # 6.88 MB, 1,000
+     {'kind': 'instruct', 'path': 'simplescaling/s1K', 'split': 'train', 'transform': lambda r: [
+         {'role': 'user', 'content': r['question']},
+         {'role': 'assistant', 'content': '<think>\n' + '\n'.join(r['thinking_trajectories']) + '\n</think>\n' + r['solution']},
+     ]},
+ ]
scripts/backup/pretrain_reflection_datasets.py ADDED
@@ -0,0 +1,39 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ pretrain_reflection_datasets = [
+     #
+     # reflection
+     #
+     # 4.17 MB, 1,000
+     {'kind': 'instruct', 'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [
+         {'role': 'system', 'content': r['system']},
+         {'role': 'user', 'content': r['prompt']},
+         {'role': 'assistant', 'content': r['response']},
+     ]},
+     # 12.4 MB, 3,000
+     {'kind': 'instruct', 'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [
+         {'role': 'system', 'content': r['system']},
+         {'role': 'user', 'content': r['prompt']},
+         {'role': 'assistant', 'content': r['response']},
+     ]},
+     # 70.8 MB, 36,549
+     {'kind': 'instruct', 'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
+         {'role': 'system', 'content': r['system']},
+         {'role': 'user', 'content': r['prompt']},
+         {'role': 'assistant', 'content': r['response']},
+     ]},
+     # 30.6 MB, 25,391
+     {'kind': 'instruct', 'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [
+         r['system'][0],
+         {'role': 'user', 'content': r['input']},
+         {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
+     ]},
+ ]
scripts/backup/unsloth_utils.py ADDED
@@ -0,0 +1,125 @@
+ from typing import Optional, Iterator, Callable, Any
+
+ import torch
+ from datasets import load_dataset, concatenate_datasets
+ from transformers import AutoTokenizer
+
+
+ def load_text_dataset(tokenizer: AutoTokenizer,
+                       kind: str,
+                       path: str,
+                       name: Optional[str]=None,
+                       data_dir: Optional[str]=None,
+                       data_files: Optional[str]=None,
+                       keep_in_memory: bool=False,
+                       revision: Optional[str]=None,
+                       split: str='train',
+                       num_proc: Optional[int]=None,
+                       format: Optional[Callable|str]=None) -> Any:
+     assert isinstance(format, str) or callable(format), f'{path=} {format=}'
+     assert kind == 'base'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     EOS_TOKEN = tokenizer.eos_token
+
+     def format_dataset(batch):
+         nonlocal EOS_TOKEN
+         nonlocal format
+         texts: list = []
+         rows = [dict(zip(batch.keys(), values)) for values in zip(*batch.values())]
+
+         if callable(format):
+             for row in rows:
+                 # print(f'{row=}')
+                 text = format(row)
+
+                 if not text:
+                     text = '[NONE]'
+
+                 text += EOS_TOKEN
+                 texts.append(text)
+         else:
+             for row in rows:
+                 # print(f'{row=}')
+                 text = format.format(**row)
+
+                 if not text:
+                     text = '[NONE]'
+
+                 text += EOS_TOKEN
+                 texts.append(text)
+
+         return {'text': texts}
+
+     dataset = dataset.map(format_dataset, batched=True)
+     return dataset
+
+
+ def load_chat_dataset(tokenizer: AutoTokenizer,
+                       kind: str,
+                       path: str,
+                       name: Optional[str]=None,
+                       data_dir: Optional[str]=None,
+                       data_files: Optional[str]=None,
+                       keep_in_memory: bool=False,
+                       revision: Optional[str]=None,
+                       split: str='train',
+                       num_proc: Optional[int]=None,
+                       field: Optional[str]=None,
+                       transform: Optional[Callable]=None) -> Any:
+     assert kind == 'instruct'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     EOS_TOKEN = tokenizer.eos_token
+
+     def format_dataset(batch):
+         nonlocal EOS_TOKEN
+         nonlocal tokenizer
+         nonlocal field
+         nonlocal transform
+         texts: list = []
+         rows = [dict(zip(batch.keys(), values)) for values in zip(*batch.values())]
+
+         if callable(transform):
+             for row in rows:
+                 if field:
+                     messages = transform(row[field])
+                 else:
+                     messages = transform(row)
+
+                 text = tokenizer.apply_chat_template(messages, tokenize=False)
+                 text += EOS_TOKEN
+                 texts.append(text)
+         else:
+             for row in rows:
+                 if field:
+                     messages = row[field]
+                 else:
+                     raise ValueError(field)
+
+                 text = tokenizer.apply_chat_template(messages, tokenize=False)
+                 text += EOS_TOKEN
+                 texts.append(text)
+
+         return {'text': texts}
+
+     dataset = dataset.map(format_dataset, batched=True)
+     return dataset
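Both helpers accept the dataset config dicts above directly as keyword arguments; a minimal usage sketch (not part of this commit, tokenizer path illustrative):

    from transformers import AutoTokenizer
    from unsloth_utils import load_text_dataset

    tokenizer = AutoTokenizer.from_pretrained('../tokenizer', trust_remote_code=True, use_fast=True)
    config = {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']}
    dataset = load_text_dataset(tokenizer, **config)  # adds a 'text' column with EOS appended
    print(dataset[0]['text'][:120])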
scripts/core_base_datasets.py ADDED
@@ -0,0 +1,89 @@
+ core_base_datasets = [
+     #
+     # multilingual
+     #
+     # 3.17 GB, 2,226,907
+     *[
+         {'kind': 'base', 'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 5)
+     ],
+     # 1.64 GB, 1,001,000
+     *[
+         {'kind': 'base', 'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 5)
+     ],
+     # 742 MB, 321,697
+     *[
+         {'kind': 'base', 'path': 'data-silence/sumnews', 'split': split, 'format': lambda n: n[field]}
+         for split in ['train', 'test']
+         for field in ['title', 'resume', 'news']
+     ],
+     # 193 MB, 1,141,967
+     *[
+         {'kind': 'base', 'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
+         for name in [
+             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+             'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+             'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+             'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+             'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+             'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+             'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+             'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+             'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+             'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+             'zh-Hans', 'zh-Hant', 'zu',
+         ]
+     ],
+
+     #
+     # misc
+     #
+     # 472 KB, 5,034
+     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
+
+     #
+     # stem
+     #
+     # 12.2 MB, 500,000
+     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
+     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
+     # 125 MB, 1,000,000
+     {'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
+
+     # 1.44 GB, 63,357
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
+         for i in range(0, 100, 10)
+     ],
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['markdown']}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # code
+     #
+     # 36.8 MB, 79,013
+     # Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
+     {'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
+     # 1.62 GB, 1,632,309
+     # Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
+     *[
+         {'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # general knowledge
+     #
+     # 3.18 GB, 1,010,500 - uncompressed 6 GB
+     *[
+         {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 5)
+     ],
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+ ]
scripts/core_instruct_datasets.py ADDED
@@ -0,0 +1,43 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+ R1_SYSTEM_PROMPT = '''\
+ You are an AI assistant.
+
+ Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
+
+ Formatting Requirements:
+ - Always structure your replies using: <think>{reasoning}</think>{answer}
+ - The <think></think> block should contain at least six reasoning steps when applicable.
+ - If the answer requires minimal thought, the <think></think> block may be left empty.
+ - The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
+ - If you notice that you have engaged in circular reasoning or repetition, immediately terminate {reasoning} with a </think> and proceed to the {answer}.
+
+ Response Guidelines:
+ - Detailed and Structured: Use rich Markdown formatting for clarity and readability.
+ - Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
+ - Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
+ - Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
+ - Maintain a professional, intelligent, and analytical tone in all interactions.'''
+
+ core_instruct_datasets = [
+     # 65.7 MB, 11,578 total; ~1.89k rows in func-calling-singleturn.json
+     {'kind': 'instruct', 'path': 'NousResearch/hermes-function-calling-v1', 'data_files': 'func-calling-singleturn.json', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+
+     # 21.1 MB, 1,000
+     {'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train', 'transform': lambda r: [
+         {'role': 'system', 'content': R1_SYSTEM_PROMPT},
+         {'role': 'user', 'content': r.get('question') or ''},
+         {'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
+     ]},
+ ]
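To see what the s1K-1.1 transform emits, it can be applied to a stand-in row (row contents fabricated for illustration; real rows come from the dataset):

    row = {'question': 'What is 2 + 2?', 'deepseek_thinking_trajectory': '2 + 2 = 4.', 'solution': '4'}
    messages = core_instruct_datasets[-1]['transform'](row)
    # -> [system: R1_SYSTEM_PROMPT, user: question, assistant: '<think>\n2 + 2 = 4.\n</think>\n4']
    for m in messages:
        print(m['role'], '=>', m['content'][:60].replace('\n', ' '))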
scripts/prepare_core_datasets.py ADDED
@@ -0,0 +1,58 @@
+ from functools import partial
+
+ from transformers import AutoTokenizer
+ from litgpt.tokenizer import Tokenizer
+ from litdata import optimize, TokensLoader, StreamingDataset
+
+ from utils import tokenize_fn
+ from core_base_datasets import core_base_datasets
+ from core_instruct_datasets import core_instruct_datasets
+
+
+ tokenizer_path = '../tokenizer'
+
+ seqs = [
+     (0, 1073741824, 4097, 4000),
+ ]
+
+ #
+ # optimize datasets
+ #
+ for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
+     chunk_size = block_size * subchunk_size
+     output_dir = f'../core-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'
+
+     outputs = optimize(
+         fn=partial(
+             tokenize_fn,
+             min_len=min_len,
+             max_len=max_len,
+             hf_tokenizer=AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, use_fast=True),
+             tokenizer=Tokenizer(tokenizer_path),
+         ),
+         inputs=core_base_datasets + core_instruct_datasets,
+         output_dir=output_dir,
+         chunk_size=chunk_size,  # number of tokens to store per chunk; roughly 64 MB of tokens
+         num_workers=32,
+         reorder_files=False,
+         ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
+         ## LitData skips storing metadata for each sample, e.g. all tokens are concatenated into one large tensor.
+         # item_loader=TokensLoader(block_size=block_size),
+     )
+
+ #
+ # total number of chunks in datasets
+ #
+ for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
+     chunk_size = block_size * subchunk_size
+     input_dir = f'../core-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'
+
+     dataset = StreamingDataset(
+         input_dir=input_dir,
+         item_loader=TokensLoader(block_size=block_size),
+     )
+
+     print(f'{i=}, {min_len=}, {max_len=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
+     total_tokens = len(dataset) * block_size
+     print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
+     print()
scripts/pretrain_core_model.yaml ADDED
@@ -0,0 +1,167 @@
+ # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
+ # ``model_config``. (type: Optional[str], default: null)
+ model_name: 'tangled-alpha-0.8-core'
+
+ # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
+ # ``model_name``. (type: Optional[Config], default: null)
+ model_config:
+   name: 'tangled-alpha-0.8-core'
+   block_size: 131072
+   vocab_size: 131072
+   padded_vocab_size: 131072
+   n_layer: 32
+   n_head: 4
+   n_embd: 512
+   n_query_groups: 4
+   rotary_percentage: 1.0
+   parallel_residual: False
+   bias: False
+   norm_class_name: "RMSNorm"
+   mlp_class_name: "LLaMAMLP"
+   intermediate_size: 2048  # n_embd * 4
+   norm_eps: 1e-5
+   rope_base: 10000
+   head_size: 128  # n_embd / n_head
+
+ # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
+ # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
+ out_dir: "../out/pretrain-core/"
+
+ # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+ # precision: bf16-mixed
+ precision: bf16-true
+
+ # Optional path to a checkpoint directory to initialize the model from.
+ # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
+ initial_checkpoint_dir:
+
+ # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
+ # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
+ # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
+ # (type: Union[bool, Literal["auto"], Path], default: False)
+ resume: "auto"
+
+ # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
+ data:
+   class_path: LitData
+
+   init_args:
+     data_path: "../core-data-0-0-1073741824-4097-4000/"
+     num_workers: 32
+
+ # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+ train:
+   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+   save_interval: 50
+
+   # Number of iterations between logging calls (type: int, default: 1)
+   log_interval: 1
+
+   # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
+   global_batch_size: 512
+
+   # Number of samples per data-parallel rank (type: int, default: 4)
+   micro_batch_size: 2
+
+   # Number of iterations with learning rate warmup active (type: int, default: 2000)
+   lr_warmup_steps: 500
+
+   # Number of epochs to train on (type: Optional[int], default: null)
+   epochs:
+
+   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
+   max_tokens: 5274490091
+
+   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+   max_steps:
+
+   # Limits the length of samples. Off by default (type: Optional[int], default: null)
+   max_seq_length: 4096
+
+   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
+   tie_embeddings: true
+
+   # (type: Optional[float], default: 1.0)
+   max_norm: 1.0
+
+   # (type: float, default: 4e-05)
+   min_lr: 3e-5
+
+ # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+ eval:
+   # Number of optimizer steps between evaluation calls (type: int, default: 1000)
+   interval: 50
+
+   # Number of tokens to generate (type: Optional[int], default: null)
+   max_new_tokens:
+
+   # Number of iterations (type: int, default: 100)
+   max_iters: 100
+
+   # Whether to evaluate on the validation set at the beginning of the training
+   initial_validation: false
+
+   # Whether to evaluate on the validation set at the end of the training
+   final_validation: true
+
+ # Optimizer-related arguments
+
+ # optimizer:
+ #   class_path: torch.optim.AdamW
+ #   # class_path: torchao.prototype.low_bit_optim.AdamW8bit
+ #   # class_path: torchao.prototype.low_bit_optim.AdamW4bit
+ #   # class_path: bitsandbytes.optim.AdamW8bit
+ #   # class_path: bitsandbytes.optim.PagedAdamW8bit
+ #   init_args:
+ #     # (type: float, default: 0.001)
+ #     lr: 3e-4
+ #     # (type: float, default: 0.01)
+ #     weight_decay: 0.01
+ #     # (type: tuple, default: (0.9,0.999))
+ #     betas:
+ #       - 0.9
+ #       - 0.999
+
+ # optimizer:
+ #   class_path: sophia_opt.SophiaG
+ #   init_args:
+ #     lr: 3e-4
+ #     betas:
+ #       - 0.9
+ #       - 0.95
+ #     rho: 0.05
+ #     weight_decay: 0.1
+
+ # optimizer:
+ #   class_path: grokadamw.GrokAdamW
+ #   init_args:
+ #     # (type: float, default: 0.001)
+ #     lr: 3e-4
+ #     # (type: float, default: 0.01)
+ #     weight_decay: 1e-2
+ #     # (type: tuple, default: (0.9,0.999))
+ #     betas:
+ #       - 0.9
+ #       - 0.999
+
+ optimizer:
+   class_path: muon.Muon
+   init_args:
+     lr: 3e-4
+     weight_decay: 0.01
+
+ # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
+ devices: auto
+
+ # How many nodes to use. (type: int, default: 1)
+ num_nodes: 1
+
+ # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
+ # modules require this. (type: Optional[Path], default: null)
+ tokenizer_dir: "../tokenizer"
+
+ # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
+ logger_name: "wandb"
+
+ # The random seed to use for reproducibility. (type: int, default: 42)
+ seed: 23
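
Rough training-length arithmetic implied by this config (a sketch; the exact step count also depends on sample packing and device count):

    global_batch_size = 512
    max_seq_length = 4096
    max_tokens = 5_274_490_091

    tokens_per_optimizer_step = global_batch_size * max_seq_length  # 2,097,152
    optimizer_steps = max_tokens // tokens_per_optimizer_step       # ~2,515
    # lr_warmup_steps=500 therefore covers roughly the first 20% of training,
    # and with micro_batch_size=2 a single device accumulates 256 micro-batches
    # per optimizer step.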
scripts/requirements.in ADDED
@@ -0,0 +1,20 @@
+ # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+ torch>=2.5.0,<2.6.0
+ numpy<2.0
+
+ tqdm
+ Pillow
+ datasets
+ jinja2
+ transformers
+ wandb
+ litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
+ mergekit @ git+https://github.com/arcee-ai/mergekit.git
+ # torchao
+ # bitsandbytes
+ # grokadamw
+ # sophia-opt
+ # dolphinflow
+ muon @ git+https://github.com/cognitivecomputations/Muon
+ # unsloth
+ lm_eval[ifeval,math]
scripts/train_tokenizer.py ADDED
@@ -0,0 +1,115 @@
+ import os
+ import shutil
+
+ from transformers import PreTrainedTokenizerFast
+ from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
+ from tokenizers.models import BPE
+ from tokenizers.trainers import BpeTrainer
+
+ from utils import batch_dataset_iterator
+ from core_base_datasets import core_base_datasets
+ from core_instruct_datasets import core_instruct_datasets
+
+
+ tokenizer_path = '../tokenizer'
+
+ if os.path.exists(tokenizer_path):
+     shutil.rmtree(tokenizer_path)
+
+ os.makedirs(tokenizer_path, exist_ok=True)
+
+ #
+ # special_tokens
+ #
+ bos_token = '<|endoftext|>'
+ eos_token = '<|im_end|>'
+ pad_token = '<|pad|>'
+ unk_token = '<|unk|>'
+
+ special_tokens = [
+     bos_token,
+     eos_token,
+     pad_token,
+     unk_token,
+     '<|im_start|>',
+     '<|im_sep|>',
+     'system',
+     'user',
+     'assistant',
+     '<tools>',
+     '</tools>',
+     '<tool>',
+     '</tool>',
+     '<tool_call>',
+     '</tool_call>',
+     '<tool_response>',
+     '</tool_response>',
+     '<think>',
+     '</think>',
+ ]
+
+ for i in range(64 - len(special_tokens)):
+     special_tokens.append(f'<|reserved_{i}|>')
+
+ #
+ # BPE Tokenizer
+ #
+ bpe = BPE(unk_token=None, byte_fallback=True)
+ tokenizer = Tokenizer(bpe)
+
+ # normalizer
+ tokenizer.normalizer = None
+
+ # pre-tokenizer
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
+
+ # post-processor
+ tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)
+
+ # decoder
+ tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
+
+ #
+ # BPE Trainer
+ #
+ trainer = BpeTrainer(
+     vocab_size=131072,  # 128 * 1024
+     min_frequency=3,
+     special_tokens=special_tokens,
+     max_token_length=16,
+ )
+
+ tokenizer_datasets = core_base_datasets + core_instruct_datasets
+
+ tokenizer.train_from_iterator(
+     (batch_dataset_iterator(n) for n in tokenizer_datasets),
+     trainer,
+ )
+
+ tokenizer.save(os.path.join(tokenizer_path, 'tokenizer.json'))
+ tokenizer.model.save(tokenizer_path)
+
+ #
+ # PreTrainedTokenizerFast
+ #
+ CHAT_TEMPLATE = (
+     "{% for message in messages %}"
+     "{{'<|im_start|>' + message['role'] + '<|im_sep|>' + message['content'] + '<|im_end|>'}}"
+     "{% endfor %}"
+
+     "{% if add_generation_prompt %}"
+     "{{ '<|im_start|>assistant<|im_sep|>' }}"
+     "{% endif %}"
+ )
+
+ fast_tokenizer = PreTrainedTokenizerFast(
+     tokenizer_object=tokenizer,
+     chat_template=CHAT_TEMPLATE,
+     bos_token=bos_token,
+     eos_token=eos_token,
+     pad_token=pad_token,
+     unk_token=unk_token,
+     clean_up_tokenization_spaces=False,
+ )
+
+ fast_tokenizer.save_pretrained(tokenizer_path)
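
A quick smoke test for the saved tokenizer (hypothetical, not part of the commit): load it back through transformers and render the chat template.

    from transformers import AutoTokenizer

    # Load the PreTrainedTokenizerFast saved above.
    tok = AutoTokenizer.from_pretrained('../tokenizer')
    text = tok.apply_chat_template(
        [{'role': 'user', 'content': 'Hello'}],
        tokenize=False,
        add_generation_prompt=True,
    )
    print(text)
    # expected: <|im_start|>user<|im_sep|>Hello<|im_end|><|im_start|>assistant<|im_sep|>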
scripts/utils.py ADDED
@@ -0,0 +1,151 @@
+ import gc
+ from typing import Optional, Iterator, Callable
+
+ import torch
+ from datasets import load_dataset
+ from litgpt.tokenizer import Tokenizer
+ from transformers import AutoTokenizer
+
+
+ def batch_text_iterator(kind: str,
+                         path: str,
+                         name: Optional[str]=None,
+                         data_dir: Optional[str]=None,
+                         data_files: Optional[str]=None,
+                         keep_in_memory: bool=False,
+                         revision: Optional[str]=None,
+                         split: str='train',
+                         num_proc: Optional[int]=None,
+                         format: Optional[Callable|str]=None) -> Iterator[str]:
+     assert isinstance(format, str) or callable(format), f'{path=} {format=}'
+     assert kind == 'base'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     if callable(format):
+         for row in dataset:
+             text = format(row)
+
+             if not text:
+                 continue
+
+             yield text
+     else:
+         for row in dataset:
+             text = format.format(**row)
+
+             if not text:
+                 continue
+
+             yield text
+
+     del dataset
+     gc.collect()
+
+
+ def batch_chat_iterator(kind: str,
+                         path: str,
+                         name: Optional[str]=None,
+                         data_dir: Optional[str]=None,
+                         data_files: Optional[str]=None,
+                         keep_in_memory: bool=False,
+                         revision: Optional[str]=None,
+                         split: str='train',
+                         num_proc: Optional[int]=None,
+                         field: Optional[str]=None,
+                         transform: Optional[Callable]=None) -> Iterator[list[dict[str, str]]]:
+     assert kind == 'instruct'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     if callable(transform):
+         for row in dataset:
+             if field:
+                 messages = transform(row[field])
+             else:
+                 messages = transform(row)
+
+             if not messages:
+                 continue
+
+             yield messages
+     else:
+         for row in dataset:
+             if field:
+                 messages = row[field]
+             else:
+                 raise ValueError(field)
+
+             if not messages:
+                 continue
+
+             yield messages
+
+     del dataset
+     gc.collect()
+
+
+ # NOTE: used only by tokenizer trainer
+ def batch_dataset_iterator(dataset_config: dict) -> Iterator[str]:
+     if dataset_config['kind'] == 'base':
+         for text in batch_text_iterator(**dataset_config):
+             yield text
+     elif dataset_config['kind'] == 'instruct':
+         for messages in batch_chat_iterator(**dataset_config):
+             text = '\n'.join(n['content'] for n in messages)
+             yield text
+
+
+ def tokenize_text_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
+     for text in batch_text_iterator(**dataset_config):
+         text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
+         yield text_ids
+
+
+ def tokenize_chat_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
+     for messages in batch_chat_iterator(**dataset_config):
+         text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
+         text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
+         yield text_ids
+
+
+ def tokenize_fn(dataset_config: dict, min_len: int, max_len: int, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
+     if dataset_config['kind'] == 'base':
+         for text in batch_text_iterator(**dataset_config):
+             try:
+                 text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
+             except Exception as e:
+                 print(f'Skip base row: {e=} {type(text)=} {text=}')
+                 continue
+
+             if min_len <= len(text_ids) <= max_len:
+                 yield text_ids
+     elif dataset_config['kind'] == 'instruct':
+         for messages in batch_chat_iterator(**dataset_config):
+             try:
+                 text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
+                 text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
+             except Exception as e:
+                 print(f'Skip instruct row: {e=} {type(messages)=} {messages=}')
+                 continue
+
+             if min_len <= len(text_ids) <= max_len:
+                 yield text_ids
+     else:
+         raise ValueError(dataset_config['kind'])
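
A hypothetical usage sketch for `tokenize_fn` outside of litdata's `optimize`; the dataset path and `format` lambda below are placeholders, not part of the commit:

    from transformers import AutoTokenizer
    from litgpt.tokenizer import Tokenizer
    from utils import tokenize_fn

    # Placeholder base-dataset config; any text dataset with a 'text' column fits.
    cfg = {'kind': 'base', 'path': 'example/dataset', 'split': 'train',
           'format': lambda row: row['text']}

    for text_ids in tokenize_fn(cfg,
                                min_len=0,
                                max_len=1073741824,
                                hf_tokenizer=AutoTokenizer.from_pretrained('../tokenizer'),
                                tokenizer=Tokenizer('../tokenizer')):
        print(len(text_ids))  # token count of one sample that passed the length filter
        break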
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|im_end|>",
+   "pad_token": "<|pad|>",
+   "unk_token": "<|unk|>"
+ }
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a45777956cfe9ce422244a4cdfafd3aacf7eb4490bde3034fec3dda9c1a98ab
+ size 9845205
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,525 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<|unk|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<|im_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "system",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "user",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "assistant",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "</tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<tool>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "</tool>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "17": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "18": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "19": {
+       "content": "<|reserved_0|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "20": {
+       "content": "<|reserved_1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "21": {
+       "content": "<|reserved_2|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "22": {
+       "content": "<|reserved_3|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "23": {
+       "content": "<|reserved_4|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "24": {
+       "content": "<|reserved_5|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "25": {
+       "content": "<|reserved_6|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "26": {
+       "content": "<|reserved_7|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "27": {
+       "content": "<|reserved_8|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "28": {
+       "content": "<|reserved_9|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "29": {
+       "content": "<|reserved_10|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30": {
+       "content": "<|reserved_11|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "31": {
+       "content": "<|reserved_12|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32": {
+       "content": "<|reserved_13|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "33": {
+       "content": "<|reserved_14|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "34": {
+       "content": "<|reserved_15|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "35": {
+       "content": "<|reserved_16|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "36": {
+       "content": "<|reserved_17|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "37": {
+       "content": "<|reserved_18|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "38": {
+       "content": "<|reserved_19|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "39": {
+       "content": "<|reserved_20|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "40": {
+       "content": "<|reserved_21|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "41": {
+       "content": "<|reserved_22|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42": {
+       "content": "<|reserved_23|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "43": {
+       "content": "<|reserved_24|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "44": {
+       "content": "<|reserved_25|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "45": {
+       "content": "<|reserved_26|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "46": {
+       "content": "<|reserved_27|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "47": {
+       "content": "<|reserved_28|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "48": {
+       "content": "<|reserved_29|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49": {
+       "content": "<|reserved_30|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50": {
+       "content": "<|reserved_31|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "51": {
+       "content": "<|reserved_32|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "52": {
+       "content": "<|reserved_33|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "53": {
+       "content": "<|reserved_34|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "54": {
+       "content": "<|reserved_35|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "55": {
+       "content": "<|reserved_36|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "56": {
+       "content": "<|reserved_37|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "57": {
+       "content": "<|reserved_38|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "58": {
+       "content": "<|reserved_39|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "59": {
+       "content": "<|reserved_40|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "60": {
+       "content": "<|reserved_41|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "61": {
+       "content": "<|reserved_42|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "62": {
+       "content": "<|reserved_43|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "63": {
+       "content": "<|reserved_44|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<|pad|>",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<|unk|>"
+ }
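
As a cross-check of the special-token layout above (a sketch; ids 0-63 should map to the 19 named tokens plus 45 reserved placeholders):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('../tokenizer')
    print(tok.convert_ids_to_tokens([0, 1, 2, 3, 4, 5]))
    # ['<|endoftext|>', '<|im_end|>', '<|pad|>', '<|unk|>', '<|im_start|>', '<|im_sep|>']
    print(len(tok.added_tokens_decoder))  # expected: 64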
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff