---
# Mixture-of-experts merge configuration.
# NOTE(review): the key layout matches a mergekit-moe config; confirm against
# the tool that actually consumes this file.

# Model that the merge is built on top of.
base_model: Qwen/QwQ-32B
gate_mode: random
architecture: qwen
dtype: bfloat16

# Eight expert entries, every one sourced from the same checkpoint as
# base_model.
experts:
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B

# A single shared-expert entry.
# NOTE(review): residual_scale is nested inside this entry because it follows
# the entry's source_model and its comment refers to the shared expert;
# confirm this matches the consumer's schema.
shared_experts:
  - source_model: Qwen/QwQ-32B
    # downweight output from shared expert to prevent overcooking the model
    residual_scale: 0.1