Parent PR: Add Multi-Quantization Support for DeepSeek-R1-Distill-Qwen-32B via MLX_LM This PR introduces a new conversion pipeline that generates multiple quantized variants of DeepSeek-R1-Distill-Qwen-32B using the MLX_LM tool. Unlike previous methods based on llama.cpp, this implementation leverages MLX_LM’s unique `quant_predicate` configuration to produce high‑quality mixed‑bit quantizations optimized specifically for MLX runs. Key Changes and Features: - MLX_LM-Based Conversion: All model conversions are performed using MLX_LM, which uses parameters like `q_bits`, `q_group_size`, and the distinctive `quant_predicate` (e.g., `"mixed_3_6"`, `"mixed_2_6"`) to create finely tuned quantized models. This provides a superior balance between quality and performance tailored for MLX inference. - Asynchronous Workflow: The new pipeline supports asynchronous conversion and upload tasks. Each quantized variant is generated concurrently and then uploaded to the designated Hugging Face repository, streamlining the overall process. - Updated Documentation: The repository’s README has been fully updated to reflect the MLX_LM conversion process, providing clear instructions on prompt formatting, downloading individual variants, and running the models with MLX. The documentation emphasizes that these quantizations are for MLX runs only and are not intended for general GGUF deployments. - Enhanced User Flexibility: With multiple quantization options (including bf16, Q8_0, Q6_K, Q5_K_M, Q4_K_M, IQ4_NL, etc.), users can select the variant that best meets their hardware and performance requirements. Detailed usage and download instructions facilitate easy deployment. Benefits: - Optimized for MLX Runs: The generated quantized models are designed specifically for MLX inference, ensuring optimal performance and compatibility with MLX’s specialized runtime. 
- Scalability and Future-Proofing: This modular pipeline allows for easy integration of additional quantization recipes and future enhancements while keeping the conversion process aligned with MLX_LM’s capabilities. - Comprehensive Documentation: The updated README and model card provide thorough guidance on model usage, including prompt format, download instructions, and hardware-specific recommendations. This PR represents a significant advancement in making DeepSeek-R1-Distill-Qwen-32B more accessible and versatile for MLX users. It establishes a parent PR that will be referenced by every subsequent quantized model upload, ensuring consistency and traceability across all releases. Feedback and suggestions are welcome.

#20

by sealad886 - opened about 16 hours ago

base: refs/heads/main

←

from: refs/pr/20

Discussion Files changed

+5933

-0

Files changed (10) hide show

.gitattributes +1 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/config.json +0 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00001-of-00004.safetensors +3 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00002-of-00004.safetensors +3 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00003-of-00004.safetensors +3 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00004-of-00004.safetensors +3 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model.safetensors.index.json +0 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/special_tokens_map.json +23 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/tokenizer.json +3 -0
DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/tokenizer_config.json +195 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 DeepSeek-R1-Distill-Qwen-32B_4bit/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 DeepSeek-R1-Distill-Qwen-32B_4bit/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/tokenizer.json filter=lfs diff=lfs merge=lfs -text

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7819f2c0378a7ea9896986fa6f2976101d6dd007bc5b22f5023a148d9a5617b6
+size 5321941889

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd4062e2e1a76c96699e2bdeaab15ce1a130f14f8ff3cc2625dc450e76bd3e96
+size 5363162608

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a23c6fb10dca081039ccca54b9e4101820f55ad2669ba6db700e2d1bd54b81b1
+size 5357249052

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d849115b9ac769b41689460305df28026834a723b2bf002a132247ef170c09f9
+size 5127218901

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
+size 11422778

DeepSeek-R1-Distill-Qwen-32B_4,8_mixed/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,195 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}