---
# File: pcb_della_merge_12b.yaml
# DELLA merge recipe (adaptive magnitude pruning) with per-layer
# parameter overrides — "PCB" (parameter competition balancing) strategy.
merge_method: della  # DELLA: strong adaptive-pruning merge method
base_model: huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated
models:
  # Model 1 — PCB strategy: restrict this model's influence to a layer
  # window and balance competition dynamically.
  - model: trashpanda-org/MS-24B-Instruct-Mullein-v0
    parameters:
      weight: 1.0
    layers:
      - layers: "8-16"
        parameter_name: density
        value: 0.4
      - layers: "8-16"
        parameter_name: epsilon
        value: 0.15
      - layers: "8-16"
        parameter_name: lambda
        value: 1.5
      - layers: "17-24"
        parameter_name: density
        value: 0.2
    # NOTE(review): nesting was ambiguous in the garbled source — placed at
    # model level (matching model 2 below); verify against the consumer's schema.
    variance_threshold: 0.3

  # Model 2 — reinforce the instruction-understanding (early) layers.
  - model: AlSamCur123/Mistral-Small3-24B-InstructContinuedFine
    parameters:
      weight: 1.0
    layers:
      - layers: "0-12"
        parameter_name: density
        value: 0.7
      - layers: "0-12"
        parameter_name: epsilon
        value: 0.05
      - layers: "0-12"
        parameter_name: lambda
        value: 2.0
    variance_threshold: 0.25

  # Model 3 (base) — parameter-protection strategy: keep base weights
  # mostly intact, fully protected in the upper half.
  - model: huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated
    parameters:
      weight: 1.0
      density: 0.9
    layers:
      - layers: "12-24"
        parameter_name: density
        value: 1.0
parameters:
  global_density: 0.55  # global pruning density (PCB balance point)
  intra_balance: true
  variance_threshold: 0.2
  epsilon_range: [0.1, 0.2]
# Take the tokenizer from the base model unchanged.
tokenizer:
  source: base
generation_config:
  eos_token_id: 2
  pad_token_id: 2  # pad aliased to EOS — common for Mistral-family models
  repetition_penalty: 1.15
  top_k: 40
  temperature: 0.8
# Parameter-compression settings (target: a 12-13B model).
# NOTE(review): resizing hidden dimensions is not something a weight merge
# can do by itself — confirm the consuming tool actually honors this section.
architecture:
  hidden_size: 3072        # significantly reduced hidden_size (5120 -> 4096 -> 3072)
  intermediate_size: 8256  # intermediate_size adjusted to keep the same ratio
  num_attention_heads: 24  # attention heads reduced proportionally
  num_hidden_layers: 30    # keep 30 layers (moderate compression)