{ "architectures": [ "TokenFormerForCausalLM" ], "num_layers": 12, "hidden_size": 768, "num_attention_heads": 12, "qkv_slot_num": 768, "proj_slot_num": 768, "ffn_slot_num": 3072, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "rotary_pct": 0.25, "no_weight_tying": false, "norm": "layernorm_nonparam", "final_norm": "layernorm", "gpt_j_residual": false, "output_layer_parallelism": "column", "use_bias_in_attn_linear": false, "attention_config": [[["tokenformer"], 12]], "norm_activation_type": "l2_norm_gelu", "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, "layernorm_fusion": false, "init_method": "normal", "output_layer_init_method": "wang_init", "use_cache": true, "torch_dtype": "float16", "transformers_version": "4.36.0" }