---
base_model:
- AlSamCur123/Mistral-Small3-24B-InstructContinuedFine
- trashpanda-org/MS-24B-Instruct-Mullein-v0
- huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated
library_name: transformers
tags:
- mergekit
- merge
---
# merge

This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).

## Merge Details

### Merge Method

This model was merged using the [DELLA](https://arxiv.org/abs/2406.11617) merge method, with [huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated](https://huggingface.co./huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated) as the base model.
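DELLA builds on TIES-style delta merging: each fine-tuned model's deltas from the base are stochastically pruned, with drop probabilities assigned inversely to delta magnitude, and the survivors are rescaled so the expected delta is unchanged. In the config below, `density` sets the fraction of deltas kept, `epsilon` the magnitude-dependent spread of drop probabilities, and `lambda` a scale on the fused deltas. A minimal sketch of the pruning step, assuming a single delta tensor (illustrative only, not mergekit's actual implementation):

```python
# A minimal sketch of DELLA-style magnitude-based delta pruning
# ("MagPrune"), assuming a single delta tensor. Illustrative only;
# mergekit's real implementation differs in detail.
import torch

def magprune(delta: torch.Tensor, density: float = 0.5, epsilon: float = 0.1) -> torch.Tensor:
    flat = delta.flatten()
    n = flat.numel()
    # Rank parameters by |delta|: rank 0 = smallest magnitude.
    ranks = flat.abs().argsort().argsort().float()
    # Base drop probability is (1 - density); shift it by up to
    # +/- epsilon/2 so low-magnitude deltas are dropped more often.
    drop_p = (1.0 - density) + epsilon * (0.5 - ranks / max(n - 1, 1))
    drop_p = drop_p.clamp(0.0, 1.0)
    keep = (torch.rand(n) >= drop_p).float()
    # Rescale survivors by 1/(1 - p) so the expected delta is unchanged.
    rescaled = flat * keep / (1.0 - drop_p).clamp_min(1e-8)
    return rescaled.reshape(delta.shape)
```

The pruned deltas from each model are then sign-elected and fused onto the base weights, as in TIES-merging, with `lambda` scaling the fused delta before it is added back.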
### Models Merged

The following models were included in the merge:

* [AlSamCur123/Mistral-Small3-24B-InstructContinuedFine](https://huggingface.co./AlSamCur123/Mistral-Small3-24B-InstructContinuedFine)
* [trashpanda-org/MS-24B-Instruct-Mullein-v0](https://huggingface.co./trashpanda-org/MS-24B-Instruct-Mullein-v0)

### Configuration

The following YAML configuration was used to produce this model:
```yaml
# Filename: pcb_della_merge_12b.yaml
merge_method: della  # still DELLA, as it is a strong adaptive pruning method
base_model: huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated
models:
  - model: trashpanda-org/MS-24B-Instruct-Mullein-v0
    parameters:
      weight: 1.0
    # PCB strategy: constrain the range of layer influence + dynamic competition balancing
    layers:
      - layers: "8-16"
        parameter_name: density
        value: 0.4
      - layers: "8-16"
        parameter_name: epsilon
        value: 0.15
      - layers: "8-16"
        parameter_name: lambda
        value: 1.5
      - layers: "17-24"
        parameter_name: density
        value: 0.2
    variance_threshold: 0.3

  - model: AlSamCur123/Mistral-Small3-24B-InstructContinuedFine
    parameters:
      weight: 1.0
    # strengthen the instruction-understanding layers
    layers:
      - layers: "0-12"
        parameter_name: density
        value: 0.7
      - layers: "0-12"
        parameter_name: epsilon
        value: 0.05
      - layers: "0-12"
        parameter_name: lambda
        value: 2.0
    variance_threshold: 0.25

  - model: huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated
    parameters:
      weight: 1.0
      # base-model parameter protection strategy
      density: 0.9
    layers:
      - layers: "12-24"
        parameter_name: density
        value: 1.0

parameters:
  global_density: 0.55  # global pruning density (the PCB balance point)
  intra_balance: true
  variance_threshold: 0.2
  epsilon_range: [0.1, 0.2]

tokenizer:
  source: base

generation_config:
  eos_token_id: 2
  pad_token_id: 2
  repetition_penalty: 1.15
  top_k: 40
  temperature: 0.8

# Parameter compression settings (targeting a 12-13B model)
architecture:
  hidden_size: 3072        # sharply reduce hidden_size (original 5120 -> 4096 -> 3072)
  intermediate_size: 8256  # adjust intermediate_size to match (ratio preserved)
  num_attention_heads: 24  # reduce attention heads to match (ratio preserved)
  num_hidden_layers: 30    # keep 30 hidden layers (moderate compression)
```
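With the stock mergekit CLI, a configuration like this is run with `mergekit-yaml pcb_della_merge_12b.yaml ./merge` (the output directory is arbitrary). Below is a minimal sketch of loading the result with transformers, reusing the sampling settings declared in the `generation_config` block; the path and prompt are placeholders:

```python
# Minimal inference sketch for the merged model; "./merge" is a
# placeholder for wherever mergekit-yaml wrote the checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./merge"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

inputs = tokenizer("Explain model merging in one paragraph.", return_tensors="pt").to(model.device)
# Sampling settings mirror the generation_config block above.
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.8,
    top_k=40,
    repetition_penalty=1.15,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```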