|
{ |
|
"src_vocab_size": 250880, |
|
"report_every": 50, |
|
"save_data": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384/", |
|
"skip_empty_level": "silent", |
|
"decoder_start_token": "<s>", |
|
"seed": 1234, |
|
"log_file": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim/48-0-32-4096-16384-with-estim.log", |
|
"n_sample": 0, |
|
"tgt_vocab_size": 250880, |
|
"default_specials": [ |
|
"<unk>", |
|
"<blank>", |
|
"<s>", |
|
"</s>" |
|
], |
|
"model": { |
|
"rotary_theta": 10000, |
|
"hidden_size": 4096, |
|
"transformer_ff": 16384, |
|
"layers": 48, |
|
"parallel_residual": false, |
|
"mlp_activation_fn": "gelu", |
|
"add_ffnbias": true, |
|
"add_qkvbias": true, |
|
"norm_eps": 1e-05, |
|
"heads": 32, |
|
"embeddings": { |
|
"n_positions": 514, |
|
"word_vec_size": 4096, |
|
"src_word_vec_size": 4096, |
|
"position_shift": 2, |
|
"freeze_word_vecs_enc": true, |
|
"position_encoding_type": "Learned", |
|
"tgt_word_vec_size": 4096 |
|
}, |
|
"shared_layer_norm": false, |
|
"num_experts_per_tok": 0, |
|
"max_relative_positions": 0, |
|
"heads_kv": 32, |
|
"num_experts": 0, |
|
"architecture": "transformer_encoder", |
|
"sliding_window": 0, |
|
"share_decoder_embeddings": true, |
|
"left_pad": false, |
|
"add_estimator": true, |
|
"encoder": { |
|
"encoder_type": "transformer", |
|
"src_word_vec_size": 4096 |
|
}, |
|
"layer_norm": "standard", |
|
"rotary_interleave": false, |
|
"rotary_dim": 0 |
|
}, |
|
"src_vocab": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xl-eole/dict2.txt", |
|
"vocab_size_multiple": 1, |
|
"share_vocab": true, |
|
"tgt_vocab": null, |
|
"transforms": [ |
|
"sentencepiece" |
|
], |
|
"transforms_configs": { |
|
"onmt_tokenize": {}, |
|
"tokendrop": {}, |
|
"bpe": {}, |
|
"filtertoolong": { |
|
"src_seq_length": 94, |
|
"tgt_seq_length": 94 |
|
}, |
|
"inlinetags": {}, |
|
"clean": {}, |
|
"suffix": {}, |
|
"docify": {}, |
|
"switchout": {}, |
|
"uppercase": {}, |
|
"terminology": {}, |
|
"sentencepiece": { |
|
"tgt_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model", |
|
"src_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model" |
|
}, |
|
"normalize": {}, |
|
"bart": {}, |
|
"insert_mask_before_placeholder": {}, |
|
"prefix": {}, |
|
"tokenmask": {} |
|
}, |
|
"training": { |
|
"world_size": 1, |
|
"w_bit": 0, |
|
"group_size": 0, |
|
"batch_type": "sents", |
|
"param_init_glorot": true, |
|
"prefetch_factor": 400, |
|
"learning_rate_decay": 1.0, |
|
"decay_steps": 100000, |
|
"param_init": 0.0, |
|
"save_checkpoint_steps": 4000, |
|
"accum_count": [ |
|
8 |
|
], |
|
"num_workers": 2, |
|
"model_dtype": "fp16", |
|
"start_decay_steps": 1000000, |
|
"label_smoothing": 0.1, |
|
"keep_checkpoint": 50, |
|
"train_from": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/", |
|
"valid_batch_size": 1, |
|
"estim_loss_lambda_steps": [ |
|
0 |
|
], |
|
"quant_type": "bnb_NF4", |
|
"batch_size_multiple": 1, |
|
"attention_dropout": [ |
|
0.0 |
|
], |
|
"learning_rate": 1.5e-05, |
|
"model_path": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim", |
|
"batch_size": 8, |
|
"dropout_steps": [ |
|
0 |
|
], |
|
"dropout": [ |
|
0.1 |
|
], |
|
"score_threshold": 0.0, |
|
"gpu_ranks": [ |
|
0 |
|
], |
|
"optim": "fusedadam", |
|
"normalization": "tokens", |
|
"valid_steps": 1000, |
|
"train_steps": 4000, |
|
"adam_beta2": 0.998, |
|
"decay_method": "none", |
|
"estim_loss_lambda": [ |
|
1.0 |
|
], |
|
"average_decay": 0.0, |
|
"accum_steps": [ |
|
0 |
|
], |
|
"quant_layers": [ |
|
"linear_values", |
|
"linear_query", |
|
"linear_keys", |
|
"final_linear", |
|
"gate_up_proj", |
|
"down_proj" |
|
], |
|
"max_grad_norm": 1.0, |
|
"self_attn_backend": "pytorch", |
|
"freeze_encoder": true, |
|
"bucket_size": 262144 |
|
}, |
|
"data": {} |
|
} |