|
{ |
|
"measurement": { |
|
"model.layers.0": { |
|
"accuracy": 0.9728512763977051, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.1": { |
|
"accuracy": 0.9760153293609619, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.2": { |
|
"accuracy": 0.9293310642242432, |
|
"total_bits": 1844214912, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.3": { |
|
"accuracy": 0.9944927841424942, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.4": { |
|
"accuracy": 0.993605300784111, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.5": { |
|
"accuracy": 0.9932107627391815, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.6": { |
|
"accuracy": 0.9913060367107391, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.7": { |
|
"accuracy": 0.9906440675258636, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.8": { |
|
"accuracy": 0.9870590567588806, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.9": { |
|
"accuracy": 0.9878467917442322, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.10": { |
|
"accuracy": 0.985314667224884, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.11": { |
|
"accuracy": 0.9831167459487915, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.12": { |
|
"accuracy": 0.9826334714889526, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.13": { |
|
"accuracy": 0.9785091876983643, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.14": { |
|
"accuracy": 0.978158712387085, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.15": { |
|
"accuracy": 0.9814843535423279, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.16": { |
|
"accuracy": 0.9805989265441895, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.17": { |
|
"accuracy": 0.9753129482269287, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.18": { |
|
"accuracy": 0.9756971597671509, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.19": { |
|
"accuracy": 0.9703136682510376, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.20": { |
|
"accuracy": 0.9715620279312134, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.21": { |
|
"accuracy": 0.9708086252212524, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.22": { |
|
"accuracy": 0.9545676708221436, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.23": { |
|
"accuracy": 0.9661253690719604, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.24": { |
|
"accuracy": 0.9636465311050415, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.25": { |
|
"accuracy": 0.9580631256103516, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.26": { |
|
"accuracy": 0.9595526456832886, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.27": { |
|
"accuracy": 0.9578239917755127, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.28": { |
|
"accuracy": 0.9548648595809937, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.29": { |
|
"accuracy": 0.9526456594467163, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.30": { |
|
"accuracy": 0.9459207057952881, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.31": { |
|
"accuracy": 0.939448356628418, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.32": { |
|
"accuracy": 0.9383738040924072, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.33": { |
|
"accuracy": 0.9380207061767578, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.34": { |
|
"accuracy": 0.933182954788208, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.35": { |
|
"accuracy": 0.9296905994415283, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.36": { |
|
"accuracy": 0.9322066307067871, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.37": { |
|
"accuracy": 0.9338107109069824, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.38": { |
|
"accuracy": 0.9316926002502441, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.39": { |
|
"accuracy": 0.9315962791442871, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.40": { |
|
"accuracy": 0.9353313446044922, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.41": { |
|
"accuracy": 0.9339642524719238, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.42": { |
|
"accuracy": 0.9359707832336426, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.43": { |
|
"accuracy": 0.9405109882354736, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.44": { |
|
"accuracy": 0.9430067539215088, |
|
"total_bits": 1454051712, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.45": { |
|
"accuracy": 0.9295551776885986, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.46": { |
|
"accuracy": 0.931776762008667, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.47": { |
|
"accuracy": 0.9356400966644287, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.48": { |
|
"accuracy": 0.9373159408569336, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.49": { |
|
"accuracy": 0.9395179748535156, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.50": { |
|
"accuracy": 0.9408411979675293, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.51": { |
|
"accuracy": 0.9418692588806152, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.52": { |
|
"accuracy": 0.944648027420044, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.53": { |
|
"accuracy": 0.9457046985626221, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.54": { |
|
"accuracy": 0.9476807117462158, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.55": { |
|
"accuracy": 0.9469304084777832, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.56": { |
|
"accuracy": 0.9432554244995117, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.57": { |
|
"accuracy": 0.9448332786560059, |
|
"total_bits": 1179493632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.58": { |
|
"accuracy": 0.9310579299926758, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.59": { |
|
"accuracy": 0.9633722305297852, |
|
"total_bits": 1165045632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
} |
|
} |
|
} |