|
{
  "architectures": [
    "TokenFormerForCausalLM"
  ],
  "num_layers": 12,
  "hidden_size": 768,
  "num_attention_heads": 12,
  "qkv_slot_num": 768,
  "proj_slot_num": 768,
  "ffn_slot_num": 3072,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "pos_emb": "rotary",
  "rotary_pct": 0.25,
  "no_weight_tying": false,
  "norm": "layernorm_nonparam",
  "final_norm": "layernorm",
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
  "use_bias_in_attn_linear": false,
  "attention_config": [[["tokenformer"], 12]],
  "norm_activation_type": "l2_norm_gelu",
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,
  "init_method": "normal",
  "output_layer_init_method": "wang_init",
  "use_cache": true,
  "torch_dtype": "float16",
  "transformers_version": "4.36.0"
}
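A minimal sketch of how the fields above fit together, assuming the JSON is saved locally as `config.json` (the filename and the derived-quantity checks are illustrative, not part of the released tooling): it loads the config with the standard library and computes the per-head dimension (768 / 12 = 64) and the number of rotary dimensions implied by `rotary_pct` (0.25 × 64 = 16).

```python
import json

# Load the TokenFormer config shown above.
# "config.json" is an assumed local path for this sketch.
with open("config.json") as f:
    cfg = json.load(f)

# 768 hidden units split across 12 attention heads -> 64 dims per head;
# rotary_pct = 0.25 applies rotary position embeddings to the first 16 of those dims.
head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]
rotary_dim = int(cfg["rotary_pct"] * head_dim)

# Basic consistency checks on the values in this config.
assert cfg["hidden_size"] % cfg["num_attention_heads"] == 0
assert cfg["seq_length"] <= cfg["max_position_embeddings"]

print(f"head_dim={head_dim}, rotary_dim={rotary_dim}")
print(f"qkv slots={cfg['qkv_slot_num']}, ffn slots={cfg['ffn_slot_num']}")
```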
|
|