diff --git "a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil" "b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil"
--- "a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil"
+++ "b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil"
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(464735296))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(464735424))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(464735552))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -22,329 +22,337 @@ program(1.3)
             int32 var_31 = const()[name = string("op_31"), val = int32(-2)];
             bool var_32 = const()[name = string("op_32"), val = bool(true)];
             tensor<int32, [1]> var_50_axes_0 = const()[name = string("op_50_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_50_cast_fp16 = squeeze(axes = var_50_axes_0, x = x)[name = string("op_50_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_50_cast_fp16 = squeeze(axes = var_50_axes_0, x = x)[name = string("op_50_cast_fp16")];
             bool var_52_interleave_0 = const()[name = string("op_52_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_52_cast_fp16 = concat(axis = var_28, interleave = var_52_interleave_0, values = (var_50_cast_fp16, eps_chan_1_to_fp16))[name = string("op_52_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_52_cast_fp16 = concat(axis = var_28, interleave = var_52_interleave_0, values = (var_50_cast_fp16, eps_chan_1_to_fp16))[name = string("op_52_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_52_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_52_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_32, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_32, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_57_to_fp16 = const()[name = string("op_57_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_57_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_57_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202379008)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_71 = const()[name = string("op_71"), val = tensor<int32, [2]>([1, 1])];
-            string var_73_pad_type_0 = const()[name = string("op_73_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_73_pad_0 = const()[name = string("op_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_73_cast_fp16 = conv(dilations = var_71, groups = var_28, pad = var_73_pad_0, pad_type = var_73_pad_type_0, strides = var_69, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_73_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_70 = const()[name = string("op_70"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
+            string var_74_pad_type_0 = const()[name = string("op_74_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_74_pad_0 = const()[name = string("op_74_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_74_cast_fp16 = conv(dilations = var_72, groups = var_28, pad = var_74_pad_0, pad_type = var_74_pad_type_0, strides = var_70, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_74_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202387264)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_73_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_79 = const()[name = string("op_79"), val = tensor<int32, [2]>([1, 1])];
-            string var_81_pad_type_0 = const()[name = string("op_81_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_81_pad_0 = const()[name = string("op_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_81_cast_fp16 = conv(dilations = var_79, groups = var_28, pad = var_81_pad_0, pad_type = var_81_pad_type_0, strides = var_77, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_81_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_74_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_78 = const()[name = string("op_78"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
+            string var_82_pad_type_0 = const()[name = string("op_82_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_82_pad_0 = const()[name = string("op_82_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_82_cast_fp16 = conv(dilations = var_80, groups = var_28, pad = var_82_pad_0, pad_type = var_82_pad_type_0, strides = var_78, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_82_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202395520)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_81_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_87 = const()[name = string("op_87"), val = tensor<int32, [2]>([1, 1])];
-            string var_89_pad_type_0 = const()[name = string("op_89_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_89_pad_0 = const()[name = string("op_89_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_89_cast_fp16 = conv(dilations = var_87, groups = var_28, pad = var_89_pad_0, pad_type = var_89_pad_type_0, strides = var_85, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_89_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_82_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_86 = const()[name = string("op_86"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
+            string var_90_pad_type_0 = const()[name = string("op_90_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_90_pad_0 = const()[name = string("op_90_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_90_cast_fp16 = conv(dilations = var_88, groups = var_28, pad = var_90_pad_0, pad_type = var_90_pad_type_0, strides = var_86, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_90_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202403776)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_89_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_91, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_93, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_95, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_107_begin_0 = const()[name = string("op_107_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_107_end_0 = const()[name = string("op_107_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_107_end_mask_0 = const()[name = string("op_107_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_107_cast_fp16 = slice_by_index(begin = var_107_begin_0, end = var_107_end_0, end_mask = var_107_end_mask_0, x = q_3_cast_fp16)[name = string("op_107_cast_fp16")];
-            tensor<int32, [4]> var_113_begin_0 = const()[name = string("op_113_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_113_end_0 = const()[name = string("op_113_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_113_end_mask_0 = const()[name = string("op_113_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_113_cast_fp16 = slice_by_index(begin = var_113_begin_0, end = var_113_end_0, end_mask = var_113_end_mask_0, x = q_3_cast_fp16)[name = string("op_113_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_90_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_92, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_94, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_96, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_108_begin_0 = const()[name = string("op_108_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_108_end_0 = const()[name = string("op_108_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_108_end_mask_0 = const()[name = string("op_108_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_108_cast_fp16 = slice_by_index(begin = var_108_begin_0, end = var_108_end_0, end_mask = var_108_end_mask_0, x = q_3_cast_fp16)[name = string("op_108_cast_fp16")];
+            tensor<int32, [4]> var_114_begin_0 = const()[name = string("op_114_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_114_end_0 = const()[name = string("op_114_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_114_end_mask_0 = const()[name = string("op_114_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_114_cast_fp16 = slice_by_index(begin = var_114_begin_0, end = var_114_end_0, end_mask = var_114_end_mask_0, x = q_3_cast_fp16)[name = string("op_114_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_115_cast_fp16 = mul(x = var_113_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_115_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_116_cast_fp16 = mul(x = var_114_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_116_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_31, interleave = rotated_1_interleave_0, values = (var_115_cast_fp16, var_107_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_118_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_118_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_119_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_119_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_118_cast_fp16, y = var_119_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_132_begin_0 = const()[name = string("op_132_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_132_end_0 = const()[name = string("op_132_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_132_end_mask_0 = const()[name = string("op_132_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_132_cast_fp16 = slice_by_index(begin = var_132_begin_0, end = var_132_end_0, end_mask = var_132_end_mask_0, x = k_3_cast_fp16)[name = string("op_132_cast_fp16")];
-            tensor<int32, [4]> var_138_begin_0 = const()[name = string("op_138_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_138_end_0 = const()[name = string("op_138_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_138_end_mask_0 = const()[name = string("op_138_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_138_cast_fp16 = slice_by_index(begin = var_138_begin_0, end = var_138_end_0, end_mask = var_138_end_mask_0, x = k_3_cast_fp16)[name = string("op_138_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_31, interleave = rotated_1_interleave_0, values = (var_116_cast_fp16, var_108_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_119_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_119_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_120_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_120_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_119_cast_fp16, y = var_120_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_133_begin_0 = const()[name = string("op_133_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_133_end_0 = const()[name = string("op_133_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_133_end_mask_0 = const()[name = string("op_133_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_133_cast_fp16 = slice_by_index(begin = var_133_begin_0, end = var_133_end_0, end_mask = var_133_end_mask_0, x = k_3_cast_fp16)[name = string("op_133_cast_fp16")];
+            tensor<int32, [4]> var_139_begin_0 = const()[name = string("op_139_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_139_end_0 = const()[name = string("op_139_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_139_end_mask_0 = const()[name = string("op_139_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_139_cast_fp16 = slice_by_index(begin = var_139_begin_0, end = var_139_end_0, end_mask = var_139_end_mask_0, x = k_3_cast_fp16)[name = string("op_139_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_140_cast_fp16 = mul(x = var_138_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_140_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_141_cast_fp16 = mul(x = var_139_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_141_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_31, interleave = rotated_3_interleave_0, values = (var_140_cast_fp16, var_132_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_143_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_143_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_144_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_144_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_143_cast_fp16, y = var_144_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_31, interleave = rotated_3_interleave_0, values = (var_141_cast_fp16, var_133_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_144_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_144_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_145_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_145_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_144_cast_fp16, y = var_145_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_19, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_19, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_151_begin_0 = const()[name = string("op_151_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_151_end_0 = const()[name = string("op_151_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_151_end_mask_0 = const()[name = string("op_151_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_151_begin_0, end = var_151_end_0, end_mask = var_151_end_mask_0, x = k_7_cast_fp16)[name = string("op_151_cast_fp16")];
-            tensor<int32, [4]> var_152_begin_0 = const()[name = string("op_152_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_152_end_0 = const()[name = string("op_152_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_152_end_mask_0 = const()[name = string("op_152_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_152_begin_0, end = var_152_end_0, end_mask = var_152_end_mask_0, x = v_5_cast_fp16)[name = string("op_152_cast_fp16")];
-            fp16 var_156_to_fp16 = const()[name = string("op_156_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_157_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_156_to_fp16)[name = string("op_157_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_31, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_7_cast_fp16)[name = string("op_156_cast_fp16")];
+            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_7_cast_fp16)[name = string("op_157_cast_fp16")];
+            fp16 var_162_to_fp16 = const()[name = string("op_162_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_163_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_162_to_fp16)[name = string("op_163_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_157_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_165_cast_fp16 = softmax(axis = var_27, x = attn_weights_3_cast_fp16)[name = string("op_165_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_165_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_169 = const()[name = string("op_169"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_169, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_173 = const()[name = string("op_173"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_175 = const()[name = string("op_175"), val = tensor<int32, [2]>([1, 1])];
-            string var_177_pad_type_0 = const()[name = string("op_177_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_177_pad_0 = const()[name = string("op_177_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_177_cast_fp16 = conv(dilations = var_175, groups = var_28, pad = var_177_pad_0, pad_type = var_177_pad_type_0, strides = var_173, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_177_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_163_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_27, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_172_transpose_x_0 = const()[name = string("op_172_transpose_x_0"), val = bool(false)];
+            bool var_172_transpose_y_0 = const()[name = string("op_172_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_172_cast_fp16 = matmul(transpose_x = var_172_transpose_x_0, transpose_y = var_172_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_172_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_175 = const()[name = string("op_175"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_172_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_175, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_179 = const()[name = string("op_179"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_181 = const()[name = string("op_181"), val = tensor<int32, [2]>([1, 1])];
+            string var_183_pad_type_0 = const()[name = string("op_183_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_183_pad_0 = const()[name = string("op_183_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_183_cast_fp16 = conv(dilations = var_181, groups = var_28, pad = var_183_pad_0, pad_type = var_183_pad_type_0, strides = var_179, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_183_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202412032)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_177_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_196_axes_0 = const()[name = string("op_196_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_196_cast_fp16 = squeeze(axes = var_196_axes_0, x = x_11_cast_fp16)[name = string("op_196_cast_fp16")];
-            bool var_198_interleave_0 = const()[name = string("op_198_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_198_cast_fp16 = concat(axis = var_28, interleave = var_198_interleave_0, values = (var_196_cast_fp16, eps_chan_3_to_fp16))[name = string("op_198_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_183_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_202_axes_0 = const()[name = string("op_202_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_202_cast_fp16 = squeeze(axes = var_202_axes_0, x = x_11_cast_fp16)[name = string("op_202_cast_fp16")];
+            bool var_204_interleave_0 = const()[name = string("op_204_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_204_cast_fp16 = concat(axis = var_28, interleave = var_204_interleave_0, values = (var_202_cast_fp16, eps_chan_3_to_fp16))[name = string("op_204_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_198_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_204_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_32, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_203_to_fp16 = const()[name = string("op_203_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_203_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_32, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_209_to_fp16 = const()[name = string("op_209_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_209_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202420288)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_215 = const()[name = string("op_215"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_217 = const()[name = string("op_217"), val = tensor<int32, [2]>([1, 1])];
-            string var_219_pad_type_0 = const()[name = string("op_219_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_219_pad_0 = const()[name = string("op_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_219_cast_fp16 = conv(dilations = var_217, groups = var_28, pad = var_219_pad_0, pad_type = var_219_pad_type_0, strides = var_215, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_219_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202428544)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_219_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_221 = const()[name = string("op_221"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_223 = const()[name = string("op_223"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_225 = const()[name = string("op_225"), val = tensor<int32, [2]>([1, 1])];
-            string var_227_pad_type_0 = const()[name = string("op_227_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_227_pad_0 = const()[name = string("op_227_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_227_cast_fp16 = conv(dilations = var_225, groups = var_28, pad = var_227_pad_0, pad_type = var_227_pad_type_0, strides = var_223, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_227_cast_fp16")];
+            string var_225_pad_type_0 = const()[name = string("op_225_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_225_pad_0 = const()[name = string("op_225_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_225_cast_fp16 = conv(dilations = var_223, groups = var_28, pad = var_225_pad_0, pad_type = var_225_pad_type_0, strides = var_221, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_225_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202428544)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_225_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_229 = const()[name = string("op_229"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_231 = const()[name = string("op_231"), val = tensor<int32, [2]>([1, 1])];
+            string var_233_pad_type_0 = const()[name = string("op_233_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_233_pad_0 = const()[name = string("op_233_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_233_cast_fp16 = conv(dilations = var_231, groups = var_28, pad = var_233_pad_0, pad_type = var_233_pad_type_0, strides = var_229, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_233_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202450624)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_227_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_229_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_229_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_229_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_233 = const()[name = string("op_233"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_235 = const()[name = string("op_235"), val = tensor<int32, [2]>([1, 1])];
-            string var_237_pad_type_0 = const()[name = string("op_237_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_237_pad_0 = const()[name = string("op_237_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_237_cast_fp16 = conv(dilations = var_235, groups = var_28, pad = var_237_pad_0, pad_type = var_237_pad_type_0, strides = var_233, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_237_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_233_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_235_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_235_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_235_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_239 = const()[name = string("op_239"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_241 = const()[name = string("op_241"), val = tensor<int32, [2]>([1, 1])];
+            string var_243_pad_type_0 = const()[name = string("op_243_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_243_pad_0 = const()[name = string("op_243_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_243_cast_fp16 = conv(dilations = var_241, groups = var_28, pad = var_243_pad_0, pad_type = var_243_pad_type_0, strides = var_239, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_243_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202472704)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_238_cast_fp16 = mul(x = var_237_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_238_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_238_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_249 = const()[name = string("op_249"), val = int32(-1)];
-            int32 var_257 = const()[name = string("op_257"), val = int32(3)];
-            int32 var_258 = const()[name = string("op_258"), val = int32(1)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(-2)];
-            bool var_262 = const()[name = string("op_262"), val = bool(true)];
-            tensor<int32, [1]> var_279_axes_0 = const()[name = string("op_279_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_279_cast_fp16 = squeeze(axes = var_279_axes_0, x = x_15_cast_fp16)[name = string("op_279_cast_fp16")];
-            bool var_281_interleave_0 = const()[name = string("op_281_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_281_cast_fp16 = concat(axis = var_258, interleave = var_281_interleave_0, values = (var_279_cast_fp16, eps_chan_5_to_fp16))[name = string("op_281_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_244_cast_fp16 = mul(x = var_243_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_244_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_244_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_255 = const()[name = string("op_255"), val = int32(-1)];
+            int32 var_263 = const()[name = string("op_263"), val = int32(3)];
+            int32 var_264 = const()[name = string("op_264"), val = int32(1)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(-2)];
+            bool var_268 = const()[name = string("op_268"), val = bool(true)];
+            tensor<int32, [1]> var_285_axes_0 = const()[name = string("op_285_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_285_cast_fp16 = squeeze(axes = var_285_axes_0, x = x_15_cast_fp16)[name = string("op_285_cast_fp16")];
+            bool var_287_interleave_0 = const()[name = string("op_287_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_287_cast_fp16 = concat(axis = var_264, interleave = var_287_interleave_0, values = (var_285_cast_fp16, eps_chan_5_to_fp16))[name = string("op_287_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_281_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_287_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_262, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_286_to_fp16 = const()[name = string("op_286_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_286_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_268, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_292_to_fp16 = const()[name = string("op_292_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_292_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202480960)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_301 = const()[name = string("op_301"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_303 = const()[name = string("op_303"), val = tensor<int32, [2]>([1, 1])];
-            string var_305_pad_type_0 = const()[name = string("op_305_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_305_pad_0 = const()[name = string("op_305_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_305_cast_fp16 = conv(dilations = var_303, groups = var_258, pad = var_305_pad_0, pad_type = var_305_pad_type_0, strides = var_301, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_305_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_310 = const()[name = string("op_310"), val = tensor<int32, [2]>([1, 1])];
+            string var_312_pad_type_0 = const()[name = string("op_312_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_312_pad_0 = const()[name = string("op_312_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_312_cast_fp16 = conv(dilations = var_310, groups = var_264, pad = var_312_pad_0, pad_type = var_312_pad_type_0, strides = var_308, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_312_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202489216)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_305_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
-            string var_313_pad_type_0 = const()[name = string("op_313_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_313_pad_0 = const()[name = string("op_313_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_313_cast_fp16 = conv(dilations = var_311, groups = var_258, pad = var_313_pad_0, pad_type = var_313_pad_type_0, strides = var_309, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_313_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_312_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_318 = const()[name = string("op_318"), val = tensor<int32, [2]>([1, 1])];
+            string var_320_pad_type_0 = const()[name = string("op_320_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_320_pad_0 = const()[name = string("op_320_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_320_cast_fp16 = conv(dilations = var_318, groups = var_264, pad = var_320_pad_0, pad_type = var_320_pad_type_0, strides = var_316, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_320_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202497472)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_313_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
-            string var_321_pad_type_0 = const()[name = string("op_321_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_321_pad_0 = const()[name = string("op_321_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_321_cast_fp16 = conv(dilations = var_319, groups = var_258, pad = var_321_pad_0, pad_type = var_321_pad_type_0, strides = var_317, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_321_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_320_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_326 = const()[name = string("op_326"), val = tensor<int32, [2]>([1, 1])];
+            string var_328_pad_type_0 = const()[name = string("op_328_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_328_pad_0 = const()[name = string("op_328_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_328_cast_fp16 = conv(dilations = var_326, groups = var_264, pad = var_328_pad_0, pad_type = var_328_pad_type_0, strides = var_324, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_328_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202505728)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_321_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_323 = const()[name = string("op_323"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_323, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_325 = const()[name = string("op_325"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_325, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_327 = const()[name = string("op_327"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_327, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_339_begin_0 = const()[name = string("op_339_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_339_end_0 = const()[name = string("op_339_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_339_end_mask_0 = const()[name = string("op_339_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_339_cast_fp16 = slice_by_index(begin = var_339_begin_0, end = var_339_end_0, end_mask = var_339_end_mask_0, x = q_9_cast_fp16)[name = string("op_339_cast_fp16")];
-            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_328_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_330, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_332, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_334 = const()[name = string("op_334"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_334, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_346_begin_0 = const()[name = string("op_346_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_346_end_0 = const()[name = string("op_346_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_346_end_mask_0 = const()[name = string("op_346_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_346_cast_fp16 = slice_by_index(begin = var_346_begin_0, end = var_346_end_0, end_mask = var_346_end_mask_0, x = q_9_cast_fp16)[name = string("op_346_cast_fp16")];
+            tensor<int32, [4]> var_352_begin_0 = const()[name = string("op_352_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_352_end_0 = const()[name = string("op_352_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_352_end_mask_0 = const()[name = string("op_352_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_352_cast_fp16 = slice_by_index(begin = var_352_begin_0, end = var_352_end_0, end_mask = var_352_end_mask_0, x = q_9_cast_fp16)[name = string("op_352_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_347_cast_fp16 = mul(x = var_345_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_347_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_354_cast_fp16 = mul(x = var_352_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_354_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_261, interleave = rotated_5_interleave_0, values = (var_347_cast_fp16, var_339_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_350_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_350_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_351_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_351_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_350_cast_fp16, y = var_351_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_364_begin_0 = const()[name = string("op_364_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_364_end_0 = const()[name = string("op_364_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_364_end_mask_0 = const()[name = string("op_364_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_364_cast_fp16 = slice_by_index(begin = var_364_begin_0, end = var_364_end_0, end_mask = var_364_end_mask_0, x = k_11_cast_fp16)[name = string("op_364_cast_fp16")];
-            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_11_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_267, interleave = rotated_5_interleave_0, values = (var_354_cast_fp16, var_346_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_357_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_358_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_358_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_357_cast_fp16, y = var_358_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_371_begin_0 = const()[name = string("op_371_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_371_end_0 = const()[name = string("op_371_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_371_end_mask_0 = const()[name = string("op_371_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_371_cast_fp16 = slice_by_index(begin = var_371_begin_0, end = var_371_end_0, end_mask = var_371_end_mask_0, x = k_11_cast_fp16)[name = string("op_371_cast_fp16")];
+            tensor<int32, [4]> var_377_begin_0 = const()[name = string("op_377_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_377_end_0 = const()[name = string("op_377_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_377_end_mask_0 = const()[name = string("op_377_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_377_cast_fp16 = slice_by_index(begin = var_377_begin_0, end = var_377_end_0, end_mask = var_377_end_mask_0, x = k_11_cast_fp16)[name = string("op_377_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_372_cast_fp16 = mul(x = var_370_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_372_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_379_cast_fp16 = mul(x = var_377_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_379_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_261, interleave = rotated_interleave_0, values = (var_372_cast_fp16, var_364_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_375_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_375_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_376_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_376_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_375_cast_fp16, y = var_376_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_267, interleave = rotated_interleave_0, values = (var_379_cast_fp16, var_371_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_382_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_383_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_383_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_382_cast_fp16, y = var_383_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_249, interleave = k_interleave_0, values = (k_cache_1, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_255, interleave = k_interleave_0, values = (k_cache_1, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_249, interleave = v_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_383_begin_0 = const()[name = string("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_383_end_0 = const()[name = string("op_383_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_383_end_mask_0 = const()[name = string("op_383_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = k_cast_fp16)[name = string("op_383_cast_fp16")];
-            tensor<int32, [4]> var_384_begin_0 = const()[name = string("op_384_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_384_end_0 = const()[name = string("op_384_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_384_end_mask_0 = const()[name = string("op_384_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_384_begin_0, end = var_384_end_0, end_mask = var_384_end_mask_0, x = v_cast_fp16)[name = string("op_384_cast_fp16")];
-            fp16 var_388_to_fp16 = const()[name = string("op_388_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_389_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_388_to_fp16)[name = string("op_389_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_389_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_397_cast_fp16 = softmax(axis = var_257, x = attn_weights_cast_fp16)[name = string("op_397_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_397_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_401 = const()[name = string("op_401"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_401, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_405 = const()[name = string("op_405"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_407 = const()[name = string("op_407"), val = tensor<int32, [2]>([1, 1])];
-            string var_409_pad_type_0 = const()[name = string("op_409_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_409_pad_0 = const()[name = string("op_409_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_409_cast_fp16 = conv(dilations = var_407, groups = var_258, pad = var_409_pad_0, pad_type = var_409_pad_type_0, strides = var_405, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_409_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_267, interleave = v_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_394_begin_0 = const()[name = string("op_394_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_394_end_0 = const()[name = string("op_394_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_394_end_mask_0 = const()[name = string("op_394_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_394_begin_0, end = var_394_end_0, end_mask = var_394_end_mask_0, x = k_cast_fp16)[name = string("op_394_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = string("op_395_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = string("op_395_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = string("op_395_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_cast_fp16)[name = string("op_395_cast_fp16")];
+            fp16 var_400_to_fp16 = const()[name = string("op_400_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_401_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_400_to_fp16)[name = string("op_401_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_401_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_263, x = attn_weights_9_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_410_transpose_x_0 = const()[name = string("op_410_transpose_x_0"), val = bool(false)];
+            bool var_410_transpose_y_0 = const()[name = string("op_410_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_410_cast_fp16 = matmul(transpose_x = var_410_transpose_x_0, transpose_y = var_410_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_410_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_413 = const()[name = string("op_413"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_410_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_413, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_417 = const()[name = string("op_417"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_419 = const()[name = string("op_419"), val = tensor<int32, [2]>([1, 1])];
+            string var_421_pad_type_0 = const()[name = string("op_421_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_421_pad_0 = const()[name = string("op_421_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_421_cast_fp16 = conv(dilations = var_419, groups = var_264, pad = var_421_pad_0, pad_type = var_421_pad_type_0, strides = var_417, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_421_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202513984)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_409_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_428_axes_0 = const()[name = string("op_428_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_428_cast_fp16 = squeeze(axes = var_428_axes_0, x = x_25_cast_fp16)[name = string("op_428_cast_fp16")];
-            bool var_430_interleave_0 = const()[name = string("op_430_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_430_cast_fp16 = concat(axis = var_258, interleave = var_430_interleave_0, values = (var_428_cast_fp16, eps_chan_7_to_fp16))[name = string("op_430_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_421_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_440_axes_0 = const()[name = string("op_440_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_440_cast_fp16 = squeeze(axes = var_440_axes_0, x = x_25_cast_fp16)[name = string("op_440_cast_fp16")];
+            bool var_442_interleave_0 = const()[name = string("op_442_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_442_cast_fp16 = concat(axis = var_264, interleave = var_442_interleave_0, values = (var_440_cast_fp16, eps_chan_7_to_fp16))[name = string("op_442_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_430_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_442_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_262, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_435_to_fp16 = const()[name = string("op_435_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_435_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_268, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_447_to_fp16 = const()[name = string("op_447_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_447_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202522240)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_447 = const()[name = string("op_447"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_449 = const()[name = string("op_449"), val = tensor<int32, [2]>([1, 1])];
-            string var_451_pad_type_0 = const()[name = string("op_451_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_451_pad_0 = const()[name = string("op_451_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_451_cast_fp16 = conv(dilations = var_449, groups = var_258, pad = var_451_pad_0, pad_type = var_451_pad_type_0, strides = var_447, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_451_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_459 = const()[name = string("op_459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_461 = const()[name = string("op_461"), val = tensor<int32, [2]>([1, 1])];
+            string var_463_pad_type_0 = const()[name = string("op_463_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_463_pad_0 = const()[name = string("op_463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_463_cast_fp16 = conv(dilations = var_461, groups = var_264, pad = var_463_pad_0, pad_type = var_463_pad_type_0, strides = var_459, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_463_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202530496)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_451_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_455 = const()[name = string("op_455"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_457 = const()[name = string("op_457"), val = tensor<int32, [2]>([1, 1])];
-            string var_459_pad_type_0 = const()[name = string("op_459_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_459_pad_0 = const()[name = string("op_459_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_459_cast_fp16 = conv(dilations = var_457, groups = var_258, pad = var_459_pad_0, pad_type = var_459_pad_type_0, strides = var_455, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_459_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202552576)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_459_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_461_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_461_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_461_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
-            tensor<int32, [2]> var_465 = const()[name = string("op_465"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_463_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_467 = const()[name = string("op_467"), val = tensor<int32, [2]>([1, 1])];
-            string var_469_pad_type_0 = const()[name = string("op_469_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_469_pad_0 = const()[name = string("op_469_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_469_cast_fp16 = conv(dilations = var_467, groups = var_258, pad = var_469_pad_0, pad_type = var_469_pad_type_0, strides = var_465, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_469_cast_fp16")];
+            tensor<int32, [2]> var_469 = const()[name = string("op_469"), val = tensor<int32, [2]>([1, 1])];
+            string var_471_pad_type_0 = const()[name = string("op_471_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_471_pad_0 = const()[name = string("op_471_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_471_cast_fp16 = conv(dilations = var_469, groups = var_264, pad = var_471_pad_0, pad_type = var_471_pad_type_0, strides = var_467, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_471_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202552576)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_471_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_473_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_473_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_473_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_477 = const()[name = string("op_477"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_479 = const()[name = string("op_479"), val = tensor<int32, [2]>([1, 1])];
+            string var_481_pad_type_0 = const()[name = string("op_481_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_481_pad_0 = const()[name = string("op_481_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_481_cast_fp16 = conv(dilations = var_479, groups = var_264, pad = var_481_pad_0, pad_type = var_481_pad_type_0, strides = var_477, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_481_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202574656)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_470_cast_fp16 = mul(x = var_469_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_470_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_470_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_476 = const()[name = string("op_476"), val = int32(-1)];
-            int32 var_485 = const()[name = string("op_485"), val = int32(1)];
-            bool var_489 = const()[name = string("op_489"), val = bool(true)];
-            tensor<int32, [1]> var_505_axes_0 = const()[name = string("op_505_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_505_cast_fp16 = squeeze(axes = var_505_axes_0, x = x_29_cast_fp16)[name = string("op_505_cast_fp16")];
-            bool var_507_interleave_0 = const()[name = string("op_507_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_507_cast_fp16 = concat(axis = var_485, interleave = var_507_interleave_0, values = (var_505_cast_fp16, eps_chan_to_fp16))[name = string("op_507_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_482_cast_fp16 = mul(x = var_481_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_482_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_482_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_488 = const()[name = string("op_488"), val = int32(-1)];
+            int32 var_497 = const()[name = string("op_497"), val = int32(1)];
+            bool var_501 = const()[name = string("op_501"), val = bool(true)];
+            tensor<int32, [1]> var_517_axes_0 = const()[name = string("op_517_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_517_cast_fp16 = squeeze(axes = var_517_axes_0, x = x_29_cast_fp16)[name = string("op_517_cast_fp16")];
+            bool var_519_interleave_0 = const()[name = string("op_519_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_519_cast_fp16 = concat(axis = var_497, interleave = var_519_interleave_0, values = (var_517_cast_fp16, eps_chan_to_fp16))[name = string("op_519_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_507_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_519_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_489, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_512_to_fp16 = const()[name = string("op_512_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_512_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_501, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_524_to_fp16 = const()[name = string("op_524_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_524_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> post_block_ln_f_weight_to_fp16 = const()[name = string("post_block_ln_f_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202582912)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = post_block_ln_f_weight_to_fp16)[name = string("x_cast_fp16")];
-            tensor<int32, [1]> var_516_axes_0 = const()[name = string("op_516_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp16, [1, 4096, 1]> var_516_cast_fp16 = squeeze(axes = var_516_axes_0, x = x_cast_fp16)[name = string("op_516_cast_fp16")];
-            tensor<int32, [3]> var_517_perm_0 = const()[name = string("op_517_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [2]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [2]>([1, 4096])];
-            tensor<fp16, [1, 1, 4096]> var_517_cast_fp16 = transpose(perm = var_517_perm_0, x = var_516_cast_fp16)[name = string("transpose_4")];
-            tensor<fp16, [1, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_4, x = var_517_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = post_block_ln_f_weight_to_fp16)[name = string("x_cast_fp16")];
+            tensor<int32, [1]> var_528_axes_0 = const()[name = string("op_528_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 4096, 4]> var_528_cast_fp16 = squeeze(axes = var_528_axes_0, x = x_cast_fp16)[name = string("op_528_cast_fp16")];
+            tensor<int32, [3]> var_529_perm_0 = const()[name = string("op_529_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [2]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [2]>([4, 4096])];
+            tensor<fp16, [1, 4, 4096]> var_529_cast_fp16 = transpose(perm = var_529_perm_0, x = var_528_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [4, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_4, x = var_529_cast_fp16)[name = string("reshape_0_cast_fp16")];
             bool matmul_0_transpose_x_0 = const()[name = string("matmul_0_transpose_x_0"), val = bool(false)];
             bool matmul_0_transpose_y_0 = const()[name = string("matmul_0_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 16384]> transpose_1_to_fp16 = const()[name = string("transpose_1_to_fp16"), val = tensor<fp16, [4096, 16384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202591168)))];
-            tensor<fp16, [1, 16384]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_1_to_fp16)[name = string("matmul_0_cast_fp16")];
-            tensor<int32, [3]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [3]>([1, 1, 16384])];
-            tensor<fp16, [1, 1, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_8, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<fp16, [4, 16384]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_1_to_fp16)[name = string("matmul_0_cast_fp16")];
+            tensor<int32, [3]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [3]>([1, 4, 16384])];
+            tensor<fp16, [1, 4, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_8, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
             bool matmul_1_transpose_x_0 = const()[name = string("matmul_1_transpose_x_0"), val = bool(false)];
             bool matmul_1_transpose_y_0 = const()[name = string("matmul_1_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 15616]> transpose_3_to_fp16 = const()[name = string("transpose_3_to_fp16"), val = tensor<fp16, [4096, 15616]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(336808960)))];
-            tensor<fp16, [1, 15616]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_3_to_fp16)[name = string("matmul_1_cast_fp16")];
-            tensor<int32, [3]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [3]>([1, 1, 15616])];
-            tensor<fp16, [1, 1, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_16, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
-            bool var_526_interleave_0 = const()[name = string("op_526_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 32000]> logits = concat(axis = var_476, interleave = var_526_interleave_0, values = (reshape_2_cast_fp16, reshape_5_cast_fp16))[name = string("op_526_cast_fp16")];
+            tensor<fp16, [4, 15616]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_3_to_fp16)[name = string("matmul_1_cast_fp16")];
+            tensor<int32, [3]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [3]>([1, 4, 15616])];
+            tensor<fp16, [1, 4, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_16, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            bool var_538_interleave_0 = const()[name = string("op_538_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 4, 32000]> logits = concat(axis = var_488, interleave = var_538_interleave_0, values = (reshape_2_cast_fp16, reshape_5_cast_fp16))[name = string("op_538_cast_fp16")];
         } -> (logits, new_k_cache_0, new_k_cache_1, new_v_cache_0, new_v_cache_1);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -379,86 +387,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_53_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202379008)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_65 = const()[name = string("op_65"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
-            string var_69_pad_type_0 = const()[name = string("op_69_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_69_pad_0 = const()[name = string("op_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_69_cast_fp16 = conv(dilations = var_67, groups = var_24, pad = var_69_pad_0, pad_type = var_69_pad_type_0, strides = var_65, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_69_cast_fp16")];
+            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
+            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_24, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202387264)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_69_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
-            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_24, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
+            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_24, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202395520)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
-            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_24, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
+            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_24, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202403776)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_87 = const()[name = string("op_87"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_87, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_89, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_91, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_103_begin_0 = const()[name = string("op_103_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_103_end_0 = const()[name = string("op_103_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_103_end_mask_0 = const()[name = string("op_103_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_103_cast_fp16 = slice_by_index(begin = var_103_begin_0, end = var_103_end_0, end_mask = var_103_end_mask_0, x = q_3_cast_fp16)[name = string("op_103_cast_fp16")];
-            tensor<int32, [4]> var_109_begin_0 = const()[name = string("op_109_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_109_end_0 = const()[name = string("op_109_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_109_end_mask_0 = const()[name = string("op_109_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_109_cast_fp16 = slice_by_index(begin = var_109_begin_0, end = var_109_end_0, end_mask = var_109_end_mask_0, x = q_3_cast_fp16)[name = string("op_109_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
+            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = mul(x = var_109_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_111_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_27, interleave = rotated_1_interleave_0, values = (var_111_cast_fp16, var_103_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_114_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_114_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_114_cast_fp16, y = var_115_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_128_begin_0 = const()[name = string("op_128_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_128_end_0 = const()[name = string("op_128_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_128_end_mask_0 = const()[name = string("op_128_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_128_cast_fp16 = slice_by_index(begin = var_128_begin_0, end = var_128_end_0, end_mask = var_128_end_mask_0, x = k_3_cast_fp16)[name = string("op_128_cast_fp16")];
-            tensor<int32, [4]> var_134_begin_0 = const()[name = string("op_134_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_134_end_0 = const()[name = string("op_134_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_134_end_mask_0 = const()[name = string("op_134_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_134_cast_fp16 = slice_by_index(begin = var_134_begin_0, end = var_134_end_0, end_mask = var_134_end_mask_0, x = k_3_cast_fp16)[name = string("op_134_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_27, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
+            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = mul(x = var_134_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_136_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_27, interleave = rotated_3_interleave_0, values = (var_136_cast_fp16, var_128_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_139_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_139_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_139_cast_fp16, y = var_140_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_27, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_27, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = k_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = v_3_cast_fp16)[name = string("op_156_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_27, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_145_begin_0 = const()[name = string("op_145_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_145_end_0 = const()[name = string("op_145_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_145_end_mask_0 = const()[name = string("op_145_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_145_begin_0, end = var_145_end_0, end_mask = var_145_end_mask_0, x = roped_3_cast_fp16)[name = string("op_145_cast_fp16")];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = v_5_cast_fp16)[name = string("op_146_cast_fp16")];
             fp16 var_160_to_fp16 = const()[name = string("op_160_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_161_cast_fp16 = mul(x = q_5_cast_fp16, y = var_160_to_fp16)[name = string("op_161_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_161_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_160_to_fp16)[name = string("op_161_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_161_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_161_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_169_cast_fp16 = softmax(axis = var_23, x = attn_weights_3_cast_fp16)[name = string("op_169_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_169_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_23, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_170_transpose_x_1 = const()[name = string("op_170_transpose_x_1"), val = bool(false)];
+            bool var_170_transpose_y_1 = const()[name = string("op_170_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_170_cast_fp16 = matmul(transpose_x = var_170_transpose_x_1, transpose_y = var_170_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_170_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_173 = const()[name = string("op_173"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_170_cast_fp16)[name = string("transpose_7")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_173, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_177 = const()[name = string("op_177"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_179 = const()[name = string("op_179"), val = tensor<int32, [2]>([1, 1])];
@@ -522,86 +530,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_290_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202480960)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_305 = const()[name = string("op_305"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
-            string var_309_pad_type_0 = const()[name = string("op_309_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_309_pad_0 = const()[name = string("op_309_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_309_cast_fp16 = conv(dilations = var_307, groups = var_262, pad = var_309_pad_0, pad_type = var_309_pad_type_0, strides = var_305, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_309_cast_fp16")];
+            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
+            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_262, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202489216)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_309_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
-            string var_317_pad_type_0 = const()[name = string("op_317_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_317_pad_0 = const()[name = string("op_317_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_317_cast_fp16 = conv(dilations = var_315, groups = var_262, pad = var_317_pad_0, pad_type = var_317_pad_type_0, strides = var_313, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_317_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
+            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_262, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202497472)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_317_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
-            string var_325_pad_type_0 = const()[name = string("op_325_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_325_pad_0 = const()[name = string("op_325_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_325_cast_fp16 = conv(dilations = var_323, groups = var_262, pad = var_325_pad_0, pad_type = var_325_pad_type_0, strides = var_321, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_325_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
+            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_262, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202505728)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_325_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_327 = const()[name = string("op_327"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_327, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_329, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_331, x = v_5_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_343_begin_0 = const()[name = string("op_343_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_343_end_0 = const()[name = string("op_343_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_343_end_mask_0 = const()[name = string("op_343_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_343_cast_fp16 = slice_by_index(begin = var_343_begin_0, end = var_343_end_0, end_mask = var_343_end_mask_0, x = q_9_cast_fp16)[name = string("op_343_cast_fp16")];
-            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_332, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
+            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = mul(x = var_349_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_351_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_265, interleave = rotated_5_interleave_0, values = (var_351_cast_fp16, var_343_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_354_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_354_cast_fp16, y = var_355_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_368_begin_0 = const()[name = string("op_368_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_368_end_0 = const()[name = string("op_368_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_368_end_mask_0 = const()[name = string("op_368_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_368_cast_fp16 = slice_by_index(begin = var_368_begin_0, end = var_368_end_0, end_mask = var_368_end_mask_0, x = k_9_cast_fp16)[name = string("op_368_cast_fp16")];
-            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_9_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_265, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
+            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = mul(x = var_374_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_376_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_265, interleave = rotated_interleave_0, values = (var_376_cast_fp16, var_368_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_379_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_379_cast_fp16, y = var_380_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_265, interleave = q_interleave_0, values = roped_5_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_265, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_395_begin_0 = const()[name = string("op_395_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_395_end_0 = const()[name = string("op_395_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_395_end_mask_0 = const()[name = string("op_395_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = k_cast_fp16)[name = string("op_395_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = v_cast_fp16)[name = string("op_396_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_265, interleave = rotated_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_385_begin_0 = const()[name = string("op_385_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_385_end_0 = const()[name = string("op_385_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_385_end_mask_0 = const()[name = string("op_385_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_385_begin_0, end = var_385_end_0, end_mask = var_385_end_mask_0, x = roped_cast_fp16)[name = string("op_385_cast_fp16")];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_9_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = v_cast_fp16)[name = string("op_386_cast_fp16")];
             fp16 var_400_to_fp16 = const()[name = string("op_400_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_401_cast_fp16 = mul(x = q_cast_fp16, y = var_400_to_fp16)[name = string("op_401_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_401_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_409_cast_fp16 = softmax(axis = var_261, x = attn_weights_cast_fp16)[name = string("op_409_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_cast_fp16, y = var_409_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_401_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_400_to_fp16)[name = string("op_401_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_401_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_261, x = attn_weights_9_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_410_transpose_x_1 = const()[name = string("op_410_transpose_x_1"), val = bool(false)];
+            bool var_410_transpose_y_1 = const()[name = string("op_410_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_410_cast_fp16 = matmul(transpose_x = var_410_transpose_x_1, transpose_y = var_410_transpose_y_1, x = attn_weights_cast_fp16, y = v_9_cast_fp16)[name = string("op_410_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_413 = const()[name = string("op_413"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_410_cast_fp16)[name = string("transpose_5")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_413, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_417 = const()[name = string("op_417"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_419 = const()[name = string("op_419"), val = tensor<int32, [2]>([1, 1])];
@@ -667,21 +675,21 @@ program(1.3)
             tensor<int32, [1]> var_528_axes_0 = const()[name = string("op_528_axes_0"), val = tensor<int32, [1]>([2])];
             tensor<fp16, [1, 4096, 512]> var_528_cast_fp16 = squeeze(axes = var_528_axes_0, x = x_cast_fp16)[name = string("op_528_cast_fp16")];
             tensor<int32, [3]> var_529_perm_0 = const()[name = string("op_529_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [2]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [2]>([512, 4096])];
+            tensor<int32, [2]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [2]>([512, 4096])];
             tensor<fp16, [1, 512, 4096]> var_529_cast_fp16 = transpose(perm = var_529_perm_0, x = var_528_cast_fp16)[name = string("transpose_4")];
-            tensor<fp16, [512, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_4, x = var_529_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<fp16, [512, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_8, x = var_529_cast_fp16)[name = string("reshape_0_cast_fp16")];
             bool matmul_0_transpose_x_0 = const()[name = string("matmul_0_transpose_x_0"), val = bool(false)];
             bool matmul_0_transpose_y_0 = const()[name = string("matmul_0_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 16384]> transpose_1_to_fp16 = const()[name = string("transpose_1_to_fp16"), val = tensor<fp16, [4096, 16384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202591168)))];
             tensor<fp16, [512, 16384]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_1_to_fp16)[name = string("matmul_0_cast_fp16")];
-            tensor<int32, [3]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [3]>([1, 512, 16384])];
-            tensor<fp16, [1, 512, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_8, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [3]> concat_12 = const()[name = string("concat_12"), val = tensor<int32, [3]>([1, 512, 16384])];
+            tensor<fp16, [1, 512, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_12, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
             bool matmul_1_transpose_x_0 = const()[name = string("matmul_1_transpose_x_0"), val = bool(false)];
             bool matmul_1_transpose_y_0 = const()[name = string("matmul_1_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 15616]> transpose_3_to_fp16 = const()[name = string("transpose_3_to_fp16"), val = tensor<fp16, [4096, 15616]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(336808960)))];
             tensor<fp16, [512, 15616]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_3_to_fp16)[name = string("matmul_1_cast_fp16")];
-            tensor<int32, [3]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [3]>([1, 512, 15616])];
-            tensor<fp16, [1, 512, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_16, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            tensor<int32, [3]> concat_20 = const()[name = string("concat_20"), val = tensor<int32, [3]>([1, 512, 15616])];
+            tensor<fp16, [1, 512, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_20, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
             bool var_538_interleave_0 = const()[name = string("op_538_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 512, 32000]> logits = concat(axis = var_488, interleave = var_538_interleave_0, values = (reshape_2_cast_fp16, reshape_5_cast_fp16))[name = string("op_538_cast_fp16")];
         } -> (logits, new_k_cache_0, new_k_cache_1, new_v_cache_0, new_v_cache_1);