{"output_dir": "/data2/assaf/mamba/outputs/models", "cache_dir": "/data2/hf_cache", "activate_logging": true, "wandb_dir": "/data2/assaf/wandb/mamba", "run_name_addon": "", "record_debug_params": false, "recover_step": null, "eval_mode": false, "mamba_arch": "deci", "model_type": "mamba-130m", "use_finetuned_model": false, "load_cp": null, "clip_grad": true, "clip_grad_max_norm": 1, "seed": 123, "lr_sched_type": "const", "save_steps": 100, "eval_steps": 10, "grad_flow_steps": 10, "max_step": 20000, "epochs": 10, "model_device": "cuda:5", "dataset": "niah_custom", "train_set_size": 6144, "eval_set_size": 20, "eval_samples_to_log": 10, "log_eval_predictions_steps": 10, "eval_max_len": 10, "max_train_input_len": 20000, "niah_train_set_size": 6144, "niah_context_len_train": 2000, "niah_needle_depths_eval": [0, 0.25, 0.5, 0.75, 1], "niah_context_lens_eval": [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000], "ppl_test_context_len_train": 2000, "ppl_test_num_windows_per_context_len_eval": 10, "ppl_test_context_lens_eval": [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000], "ppl_test_pred_len": 100, "multidoc_num_noise_docs_train": 11, "multidoc_num_noise_docs_eval": [0, 5, 10, 20, 40, 80, 120, 160, 200], "multidoc_noise_injection_policy": "random_loc", "activate_decimation": true, "decimation_type": "max_p", "decimation_beta": 0.5, "decimating_layers": [12], "decimation_min_seq_len": 20, "decimation_max_p_L_base": 2000, "find_deci_layer": false, "lr": 0.0001, "weight_decay": 0.1, "grad_accum_steps": 32, "deci_num_chunks": 1}