adamo1139 committed
Commit c735b81
1 Parent(s): 0641089

Update README.md

Files changed (1)
  1. README.md +19 -79
README.md CHANGED
@@ -55,90 +55,30 @@ This quant was created using llmcompressor.
  Code below.

  ```python
- import torch
- from datasets import load_dataset
  from transformers import AutoTokenizer

- from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
- from llmcompressor.transformers.compression.helpers import (
-     calculate_offload_device_map,
-     custom_offload_device_map,
- )

- recipe = """
- quant_stage:
-     quant_modifiers:
-         QuantizationModifier:
-             ignore: ["lm_head"]
-             config_groups:
-                 group_0:
-                     weights:
-                         num_bits: 8
-                         type: float
-                         strategy: tensor
-                         dynamic: false
-                         symmetric: true
-                     input_activations:
-                         num_bits: 8
-                         type: float
-                         strategy: tensor
-                         dynamic: false
-                         symmetric: true
-                     targets: ["Linear"]
- """
-
- model_stub = "NousResearch/Hermes-3-Llama-3.1-8B"
- model_name = model_stub.split("/")[-1]
-
- device_map = calculate_offload_device_map(
-     model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype="auto"
- )

  model = SparseAutoModelForCausalLM.from_pretrained(
-     model_stub, torch_dtype="auto", device_map=device_map
- )
- tokenizer = AutoTokenizer.from_pretrained(model_stub)
-
- output_dir = f"./{model_name}-FP8"
-
- DATASET_ID = "HuggingFaceH4/ultrachat_200k"
- DATASET_SPLIT = "train_sft"
- NUM_CALIBRATION_SAMPLES = 512
- MAX_SEQUENCE_LENGTH = 4096
-
- ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
- ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-
- def preprocess(example):
-     return {
-         "text": tokenizer.apply_chat_template(
-             example["messages"],
-             tokenize=False,
-         )
-     }
-
- ds = ds.map(preprocess)
-
- def tokenize(sample):
-     return tokenizer(
-         sample["text"],
-         padding=False,
-         max_length=MAX_SEQUENCE_LENGTH,
-         truncation=True,
-         add_special_tokens=False,
-     )
-
- ds = ds.map(tokenize, remove_columns=ds.column_names)
-
- oneshot(
-     model=model,
-     output_dir=output_dir,
-     dataset=ds,
-     recipe=recipe,
-     max_seq_length=MAX_SEQUENCE_LENGTH,
-     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-     save_compressed=True,
- )

  ```
 
 
  Code below.

  ```python
+ from llmcompressor.transformers import SparseAutoModelForCausalLM
  from transformers import AutoTokenizer
+ from llmcompressor.transformers import oneshot
+ from llmcompressor.modifiers.quantization import QuantizationModifier

+ MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"

  model = SparseAutoModelForCausalLM.from_pretrained(
+     MODEL_ID, device_map="auto", torch_dtype="auto")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+ # Configure the simple PTQ quantization
+ recipe = QuantizationModifier(
+     targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+
+ # Apply the quantization algorithm.
+ oneshot(model=model, recipe=recipe)
+
+ # Save the model.
+ SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+ model.save_pretrained(SAVE_DIR)
+ tokenizer.save_pretrained(SAVE_DIR)
+

  ```
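For reference, here is how the resulting checkpoint could be loaded for inference. This is a minimal sketch, not part of the commit itself: it assumes the FP8-Dynamic model saved by the script above is served with vLLM (which can load compressed-tensors FP8 checkpoints), and the model path is simply the `SAVE_DIR` written by `model.save_pretrained`.

```python
from vllm import LLM, SamplingParams

# Assumed local path: the SAVE_DIR produced by the quantization script above.
llm = LLM(model="Hermes-3-Llama-3.1-8B-FP8-Dynamic")

# Generate a short completion to sanity-check the quantized weights.
params = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Explain FP8 dynamic quantization in one sentence."], params)
print(outputs[0].outputs[0].text)
```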