File size: 155,412 Bytes

program(1.3)
[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
{
    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25166464))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874176))))[name = string("blocks_0_attn_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33555264))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874304))))[name = string("blocks_0_mlp_fc_1_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56099840))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874432))))[name = string("blocks_0_mlp_fc_2_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 11008, 1, 1]> blocks_0_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78644416))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874560))))[name = string("blocks_0_mlp_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101188992))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874688))))[name = string("blocks_1_attn_q_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109577792))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874816))))[name = string("blocks_1_attn_k_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117966592))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874944))))[name = string("blocks_1_attn_v_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126355392))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875072))))[name = string("blocks_1_attn_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134744192))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875200))))[name = string("blocks_1_mlp_fc_1_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157288768))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875328))))[name = string("blocks_1_mlp_fc_2_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 11008, 1, 1]> blocks_1_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179833344))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875456))))[name = string("blocks_1_mlp_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202377920))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875584))))[name = string("blocks_2_attn_q_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210766720))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875712))))[name = string("blocks_2_attn_k_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219155520))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875840))))[name = string("blocks_2_attn_v_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227544320))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875968))))[name = string("blocks_2_attn_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(235933120))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303876096))))[name = string("blocks_2_mlp_fc_1_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258477696))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303876224))))[name = string("blocks_2_mlp_fc_2_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 11008, 1, 1]> blocks_2_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281022272))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303876352))))[name = string("blocks_2_mlp_proj_weight_palettized_cast_fp16")];
            int32 var_22 = const()[name = string("op_22"), val = int32(-1)];
            int32 var_30 = const()[name = string("op_30"), val = int32(3)];
            int32 var_31 = const()[name = string("op_31"), val = int32(1)];
            int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
            bool var_35 = const()[name = string("op_35"), val = bool(true)];
            tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
            bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
            tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
            fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
            bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
            bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
            tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
            bool var_271 = const()[name = string("op_271"), val = bool(true)];
            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
            tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
            bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
            fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
            bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
            tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
            tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
            tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
            bool var_509 = const()[name = string("op_509"), val = bool(true)];
            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
            tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
            tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
            bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
            fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
            bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
            bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
            tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
            tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
            tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
            string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
        } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
    func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777536))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25166336))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25166464))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33555136))))[name = string("blocks_0_attn_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33555264))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56099712))))[name = string("blocks_0_mlp_fc_1_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56099840))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78644288))))[name = string("blocks_0_mlp_fc_2_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 11008, 1, 1]> blocks_0_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78644416))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101188864))))[name = string("blocks_0_mlp_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101188992))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109577664))))[name = string("blocks_1_attn_q_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109577792))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117966464))))[name = string("blocks_1_attn_k_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117966592))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126355264))))[name = string("blocks_1_attn_v_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126355392))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134744064))))[name = string("blocks_1_attn_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134744192))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157288640))))[name = string("blocks_1_mlp_fc_1_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157288768))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179833216))))[name = string("blocks_1_mlp_fc_2_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 11008, 1, 1]> blocks_1_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179833344))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202377792))))[name = string("blocks_1_mlp_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202377920))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210766592))))[name = string("blocks_2_attn_q_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210766720))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219155392))))[name = string("blocks_2_attn_k_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219155520))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227544192))))[name = string("blocks_2_attn_v_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227544320))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(235932992))))[name = string("blocks_2_attn_proj_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(235933120))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258477568))))[name = string("blocks_2_mlp_fc_1_weight_palettized_cast_fp16")];
            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258477696))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281022144))))[name = string("blocks_2_mlp_fc_2_weight_palettized_cast_fp16")];
            tensor<fp16, [4096, 11008, 1, 1]> blocks_2_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281022272))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303566720))))[name = string("blocks_2_mlp_proj_weight_palettized_cast_fp16")];
            int32 var_24 = const()[name = string("op_24"), val = int32(3)];
            int32 var_25 = const()[name = string("op_25"), val = int32(1)];
            int32 var_28 = const()[name = string("op_28"), val = int32(-2)];
            bool var_29 = const()[name = string("op_29"), val = bool(true)];
            tensor<int32, [1]> var_47_axes_0 = const()[name = string("op_47_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 512]> var_47_cast_fp16 = squeeze(axes = var_47_axes_0, x = x)[name = string("op_47_cast_fp16")];
            bool var_49_interleave_0 = const()[name = string("op_49_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 1, 512]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303566848)))];
            tensor<fp16, [1, 4097, 512]> var_49_cast_fp16 = concat(axis = var_25, interleave = var_49_interleave_0, values = (var_47_cast_fp16, eps_chan_1_to_fp16))[name = string("op_49_cast_fp16")];
            tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 512]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_49_cast_fp16)[name = string("x_eps_1_cast_fp16")];
            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 512]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_29, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
            fp16 var_54_to_fp16 = const()[name = string("op_54_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
            tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
            bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
            bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
            fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
            tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
            string var_182_pad_type_0 = const()[name = string("op_182_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_182_pad_0 = const()[name = string("op_182_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_182_cast_fp16 = conv(dilations = var_180, groups = var_25, pad = var_182_pad_0, pad_type = var_182_pad_type_0, strides = var_178, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_182_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
            tensor<fp16, [1, 4096, 1, 512]> attention_output_1_cast_fp16 = mul(x = var_182_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
            tensor<int32, [1]> var_201_axes_0 = const()[name = string("op_201_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 512]> var_201_cast_fp16 = squeeze(axes = var_201_axes_0, x = x_11_cast_fp16)[name = string("op_201_cast_fp16")];
            bool var_203_interleave_0 = const()[name = string("op_203_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 4097, 512]> var_203_cast_fp16 = concat(axis = var_25, interleave = var_203_interleave_0, values = (var_201_cast_fp16, eps_chan_1_to_fp16))[name = string("op_203_cast_fp16")];
            tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 512]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_203_cast_fp16)[name = string("x_eps_3_cast_fp16")];
            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 512]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_29, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
            fp16 var_208_to_fp16 = const()[name = string("op_208_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_208_to_fp16)[name = string("x_normed_9_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
            tensor<fp16, [1, 4096, 1, 512]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_222 = const()[name = string("op_222"), val = tensor<int32, [2]>([1, 1])];
            string var_224_pad_type_0 = const()[name = string("op_224_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_224_pad_0 = const()[name = string("op_224_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 512]> var_224_cast_fp16 = conv(dilations = var_222, groups = var_25, pad = var_224_pad_0, pad_type = var_224_pad_type_0, strides = var_220, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_224_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
            tensor<fp16, [1, 11008, 1, 512]> input_5_cast_fp16 = mul(x = var_224_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_230 = const()[name = string("op_230"), val = tensor<int32, [2]>([1, 1])];
            string var_232_pad_type_0 = const()[name = string("op_232_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_232_pad_0 = const()[name = string("op_232_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 512]> var_232_cast_fp16 = conv(dilations = var_230, groups = var_25, pad = var_232_pad_0, pad_type = var_232_pad_type_0, strides = var_228, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_232_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
            tensor<fp16, [1, 11008, 1, 512]> x_fc_2_1_cast_fp16 = mul(x = var_232_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 512]> var_234_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_234_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 512]> input_7_cast_fp16 = mul(x = var_234_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_240 = const()[name = string("op_240"), val = tensor<int32, [2]>([1, 1])];
            string var_242_pad_type_0 = const()[name = string("op_242_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_242_pad_0 = const()[name = string("op_242_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_242_cast_fp16 = conv(dilations = var_240, groups = var_25, pad = var_242_pad_0, pad_type = var_242_pad_type_0, strides = var_238, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_242_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
            tensor<fp16, [1, 4096, 1, 512]> var_243_cast_fp16 = mul(x = var_242_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_243_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_15_cast_fp16 = add(x = var_243_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
            int32 var_262 = const()[name = string("op_262"), val = int32(3)];
            int32 var_263 = const()[name = string("op_263"), val = int32(1)];
            int32 var_266 = const()[name = string("op_266"), val = int32(-2)];
            bool var_267 = const()[name = string("op_267"), val = bool(true)];
            tensor<int32, [1]> var_284_axes_0 = const()[name = string("op_284_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 512]> var_284_cast_fp16 = squeeze(axes = var_284_axes_0, x = x_15_cast_fp16)[name = string("op_284_cast_fp16")];
            bool var_286_interleave_0 = const()[name = string("op_286_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 4097, 512]> var_286_cast_fp16 = concat(axis = var_263, interleave = var_286_interleave_0, values = (var_284_cast_fp16, eps_chan_1_to_fp16))[name = string("op_286_cast_fp16")];
            tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 512]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_286_cast_fp16)[name = string("x_eps_5_cast_fp16")];
            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 512]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_267, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
            fp16 var_291_to_fp16 = const()[name = string("op_291_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
            tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
            bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
            fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
            bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
            fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
            tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
            tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
            string var_422_pad_type_0 = const()[name = string("op_422_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_422_pad_0 = const()[name = string("op_422_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_422_cast_fp16 = conv(dilations = var_420, groups = var_263, pad = var_422_pad_0, pad_type = var_422_pad_type_0, strides = var_418, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_422_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
            tensor<fp16, [1, 4096, 1, 512]> attention_output_3_cast_fp16 = mul(x = var_422_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
            tensor<int32, [1]> var_441_axes_0 = const()[name = string("op_441_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 512]> var_441_cast_fp16 = squeeze(axes = var_441_axes_0, x = x_25_cast_fp16)[name = string("op_441_cast_fp16")];
            bool var_443_interleave_0 = const()[name = string("op_443_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 4097, 512]> var_443_cast_fp16 = concat(axis = var_263, interleave = var_443_interleave_0, values = (var_441_cast_fp16, eps_chan_1_to_fp16))[name = string("op_443_cast_fp16")];
            tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 512]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_443_cast_fp16)[name = string("x_eps_7_cast_fp16")];
            tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 512]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_267, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
            fp16 var_448_to_fp16 = const()[name = string("op_448_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_448_to_fp16)[name = string("x_normed_21_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
            tensor<fp16, [1, 4096, 1, 512]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
            string var_464_pad_type_0 = const()[name = string("op_464_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_464_pad_0 = const()[name = string("op_464_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 512]> var_464_cast_fp16 = conv(dilations = var_462, groups = var_263, pad = var_464_pad_0, pad_type = var_464_pad_type_0, strides = var_460, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_464_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
            tensor<fp16, [1, 11008, 1, 512]> input_13_cast_fp16 = mul(x = var_464_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 512]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_263, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_472_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
            tensor<fp16, [1, 11008, 1, 512]> x_fc_2_3_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 512]> var_474_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_474_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 512]> input_15_cast_fp16 = mul(x = var_474_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
            tensor<int32, [2]> var_478 = const()[name = string("op_478"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
            string var_482_pad_type_0 = const()[name = string("op_482_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_482_pad_0 = const()[name = string("op_482_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_482_cast_fp16 = conv(dilations = var_480, groups = var_263, pad = var_482_pad_0, pad_type = var_482_pad_type_0, strides = var_478, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_482_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
            tensor<fp16, [1, 4096, 1, 512]> var_483_cast_fp16 = mul(x = var_482_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_483_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_29_cast_fp16 = add(x = var_483_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
            int32 var_502 = const()[name = string("op_502"), val = int32(3)];
            int32 var_503 = const()[name = string("op_503"), val = int32(1)];
            int32 var_506 = const()[name = string("op_506"), val = int32(-2)];
            bool var_507 = const()[name = string("op_507"), val = bool(true)];
            tensor<int32, [1]> var_524_axes_0 = const()[name = string("op_524_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 512]> var_524_cast_fp16 = squeeze(axes = var_524_axes_0, x = x_29_cast_fp16)[name = string("op_524_cast_fp16")];
            bool var_526_interleave_0 = const()[name = string("op_526_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 4097, 512]> var_526_cast_fp16 = concat(axis = var_503, interleave = var_526_interleave_0, values = (var_524_cast_fp16, eps_chan_1_to_fp16))[name = string("op_526_cast_fp16")];
            tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 512]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_526_cast_fp16)[name = string("x_eps_9_cast_fp16")];
            tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 512]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_507, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
            fp16 var_531_to_fp16 = const()[name = string("op_531_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
            tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
            bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
            fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
            bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
            tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_503, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
            tensor<fp16, [1, 4096, 1, 512]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4096, 512]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
            tensor<fp16, [1, 4097, 512]> var_683_cast_fp16 = concat(axis = var_503, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_1_to_fp16))[name = string("op_683_cast_fp16")];
            tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
            tensor<fp16, [1, 4097, 1, 512]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
            tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
            tensor<fp16, [1, 1, 1, 512]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_507, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
            tensor<fp16, [1, 4096, 1, 512]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
            tensor<fp16, [1, 4096, 1, 512]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
            tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
            string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 512]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_503, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
            tensor<fp16, [1, 11008, 1, 512]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 11008, 1, 512]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_503, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
            tensor<fp16, [1, 11008, 1, 512]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 512]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
            tensor<fp16, [1, 11008, 1, 512]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
            tensor<fp16, [1, 4096, 1, 512]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_503, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
            tensor<fp16, [1, 4096, 1, 512]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
            tensor<fp16, [1, 4096, 1, 512]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
        } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
}