cometkiwi-xxl-eole / config.json
{
"src_vocab_size": 250880,
"report_every": 50,
"save_data": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384/",
"skip_empty_level": "silent",
"decoder_start_token": "<s>",
"seed": 1234,
"log_file": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim/48-0-32-4096-16384-with-estim.log",
"n_sample": 0,
"tgt_vocab_size": 250880,
"default_specials": [
"<unk>",
"<blank>",
"<s>",
"</s>"
],
"model": {
"rotary_theta": 10000,
"hidden_size": 4096,
"transformer_ff": 16384,
"layers": 48,
"parallel_residual": false,
"mlp_activation_fn": "gelu",
"add_ffnbias": true,
"add_qkvbias": true,
"norm_eps": 1e-05,
"heads": 32,
"embeddings": {
"n_positions": 514,
"word_vec_size": 4096,
"src_word_vec_size": 4096,
"position_shift": 2,
"freeze_word_vecs_enc": true,
"position_encoding_type": "Learned",
"tgt_word_vec_size": 4096,
},
"shared_layer_norm": false,
"num_experts_per_tok": 0,
"max_relative_positions": 0,
"heads_kv": 32,
"num_experts": 0,
"architecture": "transformer_encoder",
"sliding_window": 0,
"share_decoder_embeddings": true,
"left_pad": false,
"add_estimator": true,
"encoder": {
"encoder_type": "transformer",
"src_word_vec_size": 4096
},
"layer_norm": "standard",
"rotary_interleave": false,
"rotary_dim": 0
},
"src_vocab": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xl-eole/dict2.txt",
"vocab_size_multiple": 1,
"share_vocab": true,
"tgt_vocab": null,
"transforms": [
"sentencepiece"
],
"transforms_configs": {
"onmt_tokenize": {},
"tokendrop": {},
"bpe": {},
"filtertoolong": {
"src_seq_length": 94,
"tgt_seq_length": 94
},
"inlinetags": {},
"clean": {},
"suffix": {},
"docify": {},
"switchout": {},
"uppercase": {},
"terminology": {},
"sentencepiece": {
"tgt_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model",
"src_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model"
},
"normalize": {},
"bart": {},
"insert_mask_before_placeholder": {},
"prefix": {},
"tokenmask": {}
},
"training": {
"world_size": 1,
"w_bit": 0,
"group_size": 0,
"batch_type": "sents",
"param_init_glorot": true,
"prefetch_factor": 400,
"learning_rate_decay": 1.0,
"decay_steps": 100000,
"param_init": 0.0,
"save_checkpoint_steps": 4000,
"accum_count": [
8
],
"num_workers": 2,
"model_dtype": "fp16",
"start_decay_steps": 1000000,
"label_smoothing": 0.1,
"keep_checkpoint": 50,
"train_from": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/",
"valid_batch_size": 1,
"estim_loss_lambda_steps": [
0
],
"quant_type": "bnb_NF4",
"batch_size_multiple": 1,
"attention_dropout": [
0.0
],
"learning_rate": 1.5e-05,
"model_path": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim",
"batch_size": 8,
"dropout_steps": [
0
],
"dropout": [
0.1
],
"score_threshold": 0.0,
"gpu_ranks": [
0
],
"optim": "fusedadam",
"normalization": "tokens",
"valid_steps": 1000,
"train_steps": 4000,
"adam_beta2": 0.998,
"decay_method": "none",
"estim_loss_lambda": [
1.0
],
"average_decay": 0.0,
"accum_steps": [
0
],
"quant_layers": [
"linear_values",
"linear_query",
"linear_keys",
"final_linear",
"gate_up_proj",
"down_proj"
],
"max_grad_norm": 1.0,
"self_attn_backend": "pytorch",
"freeze_encoder": true,
"bucket_size": 262144
},
"data": {}
}
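
For reference, a minimal sketch of reading the hyperparameters above with the Python standard library (assumptions: the JSON object is saved locally as config.json, and the field names used here are exactly those in the file; this is not the eole API itself):

import json
from pathlib import Path

# Parse the raw config file (the JSON object shown above).
cfg = json.loads(Path("config.json").read_text(encoding="utf-8"))

model = cfg["model"]
hidden = model["hidden_size"]      # 4096
layers = model["layers"]           # 48
heads = model["heads"]             # 32
ffn = model["transformer_ff"]      # 16384

# Per-head dimension of the attention projections: 4096 / 32 = 128.
head_dim = hidden // heads

print(f"{model['architecture']}: {layers} layers, "
      f"{heads} heads x {head_dim} dims/head, FFN size {ffn}")
print(f"shared vocab size: {cfg['src_vocab_size']}, "
      f"estimator head: {model['add_estimator']}")

The encoder geometry (48 layers, hidden size 4096, FFN 16384, 32 heads, 250880-token shared vocabulary) matches XLM-RoBERTa-XXL, whose converted weights are loaded via train_from and kept frozen (freeze_encoder, freeze_word_vecs_enc) while the added estimator head (add_estimator) is trained.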