muhtasham committed on
Commit
ed15eb4
1 Parent(s): 4cd11de

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +89 -0
README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ import time
3
+ from datasets import load_dataset
4
+ from transformers import AutoTokenizer
5
+ from rich import print
6
+ from llmcompressor.modifiers.quantization import GPTQModifier
7
+ from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
8
+
9
# Checkpoint to quantize, given as a Hugging Face Hub repo id.
MODEL_ID = "Unbabel/TowerInstruct-7B-v0.1"

# Load the model and its tokenizer from the same id. Device placement and
# dtype are both passed as "auto" instead of being pinned in this script.
model = SparseAutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
17
+
18
# Calibration data source: dataset id and the split to read from it.
DATASET_ID = "neuralmagic/LLM_compression_calibration"
DATASET_SPLIT = "train"

# Calibration budget. 512 samples is a reasonable starting point and more
# samples can improve accuracy; this script uses 756 samples, each truncated
# to at most 2048 tokens later on.
NUM_CALIBRATION_SAMPLES = 756
MAX_SEQUENCE_LENGTH = 2048

# Load the split, shuffle it with a fixed seed so the chosen subset is
# reproducible, then keep the first NUM_CALIBRATION_SAMPLES rows.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42)
ds = ds.select(range(NUM_CALIBRATION_SAMPLES))
30
+
31
+
32
def preprocess(example):
    """Render one example's chat messages into a single prompt string.

    Reads ``example["messages"]`` and runs it through the tokenizer's chat
    template without tokenizing, so the result is plain text.

    Returns:
        A dict with a single ``"text"`` key holding the rendered string.
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


# Add the rendered "text" column to every calibration row.
ds = ds.map(preprocess)
42
+
43
+
44
def tokenize(sample):
    """Tokenize the rendered ``"text"`` field of one calibration row.

    The text is truncated to ``MAX_SEQUENCE_LENGTH`` tokens, left unpadded,
    and no special tokens are added by this call.

    Returns:
        The tokenizer's output for ``sample["text"]``.
    """
    encoding_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    return tokenizer(sample["text"], **encoding_options)


# Replace every existing column with the tokenizer outputs.
ds = ds.map(tokenize, remove_columns=ds.column_names)
56
+
57
# Quantization recipe: GPTQ over every "Linear" target using the "W4A16"
# scheme (4-bit weights, group size 128 per the original script's comment),
# leaving "lm_head" unquantized.
# NOTE(review): the original script suggests `sequential_update=False` to
# reduce GPU memory; confirm against the GPTQModifier docs for the installed
# llmcompressor version before relying on it.
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
)
print(recipe)

# Run one-shot calibration and quantization of the model with the recipe,
# using the same sample count and sequence length as the prepared dataset.
oneshot(
    model=model,
    recipe=recipe,
    dataset=ds,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    max_seq_length=MAX_SEQUENCE_LENGTH,
)
71
+
72
# Sanity-check the quantized model with one translation prompt and report how
# long generation took.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_text = (
    "Translate the following text from Portuguese into English.\n"
    "Portuguese: Um grupo de investigadores lançou um novo modelo para tarefas relacionadas com tradução.\n"
    "English:"
)
# The model was loaded with device_map="auto", so send the prompt to whatever
# device the model reports instead of assuming "cuda" is present.
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()
output = model.generate(input_ids, max_new_tokens=100)
end_time = time.perf_counter()
generation_time = end_time - start_time
print(tokenizer.decode(output[0]))
print(f"Generation time: {generation_time:.2f} seconds")
print("==========================================\n\n")
84
+
85
# Save the compressed model and its tokenizer side by side.
# The directory name is built from the last path component of MODEL_ID, so an
# id without an organisation prefix (or a local path) no longer raises
# IndexError or picks an intermediate path segment. For "org/name" ids the
# result is the same as before.
SAVE_DIR = MODEL_ID.rstrip("/").rsplit("/", 1)[-1] + "-quantized.w4a16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
89
+ ```