{ "src_vocab_size": 250880, "report_every": 50, "save_data": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384/", "skip_empty_level": "silent", "decoder_start_token": "", "seed": 1234, "log_file": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim/48-0-32-4096-16384-with-estim.log", "n_sample": 0, "tgt_vocab_size": 250880, "default_specials": [ "", "", "", "" ], "model": { "rotary_theta": 10000, "hidden_size": 4096, "transformer_ff": 16384, "layers": 48, "parallel_residual": false, "mlp_activation_fn": "gelu", "add_ffnbias": true, "add_qkvbias": true, "norm_eps": 1e-05, "heads": 32, "embeddings": { "n_positions": 514, "word_vec_size": 4096, "src_word_vec_size": 4096, "position_shift": 2, "freeze_word_vecs_enc": true, "position_encoding_type": "Learned", "tgt_word_vec_size": 4096, }, "shared_layer_norm": false, "num_experts_per_tok": 0, "max_relative_positions": 0, "heads_kv": 32, "num_experts": 0, "architecture": "transformer_encoder", "sliding_window": 0, "share_decoder_embeddings": true, "left_pad": false, "add_estimator": true, "encoder": { "encoder_type": "transformer", "src_word_vec_size": 4096 }, "layer_norm": "standard", "rotary_interleave": false, "rotary_dim": 0 }, "src_vocab": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xl-eole/dict2.txt", "vocab_size_multiple": 1, "share_vocab": true, "tgt_vocab": null, "transforms": [ "sentencepiece" ], "transforms_configs": { "onmt_tokenize": {}, "tokendrop": {}, "bpe": {}, "filtertoolong": { "src_seq_length": 94, "tgt_seq_length": 94 }, "inlinetags": {}, "clean": {}, "suffix": {}, "docify": {}, "switchout": {}, "uppercase": {}, "terminology": {}, "sentencepiece": { "tgt_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model", "src_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model" }, "normalize": {}, "bart": {}, "insert_mask_before_placeholder": {}, "prefix": {}, "tokenmask": {} }, "training": { "world_size": 1, "w_bit": 0, "group_size": 0, "batch_type": "sents", "param_init_glorot": true, "prefetch_factor": 400, "learning_rate_decay": 1.0, "decay_steps": 100000, "param_init": 0.0, "save_checkpoint_steps": 4000, "accum_count": [ 8 ], "num_workers": 2, "model_dtype": "fp16", "start_decay_steps": 1000000, "label_smoothing": 0.1, "keep_checkpoint": 50, "train_from": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/", "valid_batch_size": 1, "estim_loss_lambda_steps": [ 0 ], "quant_type": "bnb_NF4", "batch_size_multiple": 1, "attention_dropout": [ 0.0 ], "learning_rate": 1.5e-05, "model_path": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim", "batch_size": 8, "dropout_steps": [ 0 ], "dropout": [ 0.1 ], "score_threshold": 0.0, "gpu_ranks": [ 0 ], "optim": "fusedadam", "normalization": "tokens", "valid_steps": 1000, "train_steps": 4000, "adam_beta2": 0.998, "decay_method": "none", "estim_loss_lambda": [ 1.0 ], "average_decay": 0.0, "accum_steps": [ 0 ], "quant_layers": [ "linear_values", "linear_query", "linear_keys", "final_linear", "gate_up_proj", "down_proj" ], "max_grad_norm": 1.0, "self_attn_backend": "pytorch", "freeze_encoder": true, "bucket_size": 262144 }, "data": {} }