{
  "architectures": [
    "Transformer"
  ],
  "auto_map": {
    "AutoConfig": "LMConfig.LMConfig",
    "AutoModelForCausalLM": "model.Transformer"
  },
  "aux_loss_alpha": 0.01,
  "dim": 768,
  "dropout": 0.0,
  "flash_attn": true,
  "hidden_dim": null,
  "max_seq_len": 512,
  "model_type": "minimind",
  "multiple_of": 64,
  "n_heads": 16,
  "n_kv_heads": 8,
  "n_layers": 16,
  "n_routed_experts": 4,
  "n_shared_experts": true,
  "norm_eps": 1e-05,
  "norm_topk_prob": true,
  "num_experts_per_tok": 2,
  "scoring_func": "softmax",
  "seq_aux": true,
  "torch_dtype": "float32",
  "transformers_version": "4.37.2",
  "use_moe": false,
  "vocab_size": 6400
}