gustavoaq committed
Commit d445807 · 1 Parent(s): 5c5bb14

Update finetune.py

Files changed (1)
  1. finetune.py +160 -160
finetune.py CHANGED
@@ -15,170 +15,170 @@ from peft import (
     get_peft_model_state_dict,
 )
 
-# HF_TOKEN = os.environ.get("TRL_TOKEN", None)
-# if HF_TOKEN:
-#     print(HF_TOKEN)
-#     repo = Repository(
-#         local_dir="./checkpoints/", clone_from="gustavoaq/llama_ft", use_auth_token=HF_TOKEN, repo_type="models"
-#     )
-#     repo.git_pull()
-# # Parameters
-
-# MICRO_BATCH_SIZE = 16
-# BATCH_SIZE = 32
-# size = "7b"
-# GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
-# EPOCHS = 1
-# LEARNING_RATE = float(0.00015)
-# CUTOFF_LEN = 512
-# LORA_R = 8
-# LORA_ALPHA = 16
-# LORA_DROPOUT = 0.05
-# VAL_SET_SIZE = 2000
-# TARGET_MODULES = [
-#     "q_proj",
-#     "k_proj",
-#     "v_proj",
-#     "down_proj",
-#     "gate_proj",
-#     "up_proj",
-# ]
-# DATA_PATH = "data/data_tmp.json"
-# OUTPUT_DIR = "checkpoints/{}".format(size)
-
-# if not os.path.exists("data"):
-#     os.makedirs("data")
-# # Load data
-# data = []
-# for x in "alpaca,stackoverflow,quora".split(","):
-#     data += json.load(open("data/{}_chat_data.json".format(x)))
-# random.shuffle(data)
-# json.dump(data, open(DATA_PATH, "w"))
-# data = load_dataset("json", data_files=DATA_PATH)
-
-# # Load Model
-# device_map = "auto"
-# world_size = int(os.environ.get("WORLD_SIZE", 1))
-# ddp = world_size != 1
-# if ddp:
-#     device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
-#     GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size
-
-# model = LlamaForCausalLM.from_pretrained(
-#     "decapoda-research/llama-{}-hf".format(size),
-#     load_in_8bit=True,
-#     device_map='auto',
-# )
-# total_params, params = 0, 0
-
-# tokenizer = LlamaTokenizer.from_pretrained(
-#     "decapoda-research/llama-{}-hf".format(size), add_eos_token=True,
-#     load_in_8bit_fp32_cpu_offload=True, device_map={0: [0]},
-# )
-
-# model = prepare_model_for_int8_training(model)
-
-# config = LoraConfig(
-#     r=LORA_R,
-#     lora_alpha=LORA_ALPHA,
-#     target_modules=TARGET_MODULES,
-#     lora_dropout=LORA_DROPOUT,
-#     bias="none",
-#     task_type="CAUSAL_LM",
-# )
-# config.save_pretrained(OUTPUT_DIR)
-
-# model = get_peft_model(model, config)
-# tokenizer.pad_token_id = 0
-
-# for n, p in model.model.named_parameters():
-#     if any([x in n for x in ["lora"]]):
-#         total_params += p.numel()
-#     params += p.numel()
-
-# print(
-#     "Total number of parameters: {}M, rate: {}%".format(
-#         total_params // 1000 / 1000, round(total_params / params * 100, 2)
-#     )
-# )
-
-
-# # Data Preprocess
-# def generate_prompt(data_point):
-#     return data_point["input"]
-
-
-# def tokenize(prompt):
-#     result = tokenizer(
-#         prompt,
-#         truncation=True,
-#         max_length=CUTOFF_LEN + 1,
-#         padding="max_length",
-#     )
-#     return {
-#         "input_ids": result["input_ids"][:-1],
-#         "attention_mask": result["attention_mask"][:-1],
-#     }
-
-
-# def generate_and_tokenize_prompt(data_point):
-#     prompt = generate_prompt(data_point)
-#     return tokenize(prompt)
-
-
-# if VAL_SET_SIZE > 0:
-#     train_val = data["train"].train_test_split(
-#         test_size=VAL_SET_SIZE, shuffle=True, seed=42
-#     )
-#     train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
-#     val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
-# else:
-#     train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
-#     val_data = None
-
-
-# # Training
-# trainer = transformers.Trainer(
-#     model=model,
-#     train_dataset=train_data,
-#     eval_dataset=val_data,
-#     args=transformers.TrainingArguments(
-#         per_device_train_batch_size=MICRO_BATCH_SIZE,
-#         per_device_eval_batch_size=MICRO_BATCH_SIZE,
-#         gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
-#         warmup_steps=100,
-#         num_train_epochs=EPOCHS,
-#         learning_rate=LEARNING_RATE,
-#         fp16=True,
-#         logging_steps=20,
-#         evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no",
-#         save_strategy="steps",
-#         eval_steps=200 if VAL_SET_SIZE > 0 else None,
-#         save_steps=200,
-#         output_dir=OUTPUT_DIR,
-#         save_total_limit=100,
-#         load_best_model_at_end=True if VAL_SET_SIZE > 0 else False,
-#         ddp_find_unused_parameters=False if ddp else None,
-#     ),
-#     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
-# )
-# model.config.use_cache = False
-
-# old_state_dict = model.state_dict
-# model.state_dict = (
-#     lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
-# ).__get__(model, type(model))
+HF_TOKEN = os.environ.get("TRL_TOKEN", None)
+if HF_TOKEN:
+    print(HF_TOKEN)
+    repo = Repository(
+        local_dir="./checkpoints/", clone_from="gustavoaq/llama_ft", use_auth_token=HF_TOKEN, repo_type="models"
+    )
+    repo.git_pull()
+# Parameters
+
+MICRO_BATCH_SIZE = 16
+BATCH_SIZE = 32
+size = "7b"
+GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
+EPOCHS = 1
+LEARNING_RATE = float(0.00015)
+CUTOFF_LEN = 512
+LORA_R = 8
+LORA_ALPHA = 16
+LORA_DROPOUT = 0.05
+VAL_SET_SIZE = 2000
+TARGET_MODULES = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "gate_proj",
+    "up_proj",
+]
+DATA_PATH = "data/data_tmp.json"
+OUTPUT_DIR = "checkpoints/{}".format(size)
+
+if not os.path.exists("data"):
+    os.makedirs("data")
+# Load data
+data = []
+for x in "alpaca,stackoverflow,quora".split(","):
+    data += json.load(open("data/{}_chat_data.json".format(x)))
+random.shuffle(data)
+json.dump(data, open(DATA_PATH, "w"))
+data = load_dataset("json", data_files=DATA_PATH)
+
+# Load Model
+device_map = "auto"
+world_size = int(os.environ.get("WORLD_SIZE", 1))
+ddp = world_size != 1
+if ddp:
+    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
+    GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size
+
+model = LlamaForCausalLM.from_pretrained(
+    "decapoda-research/llama-{}-hf".format(size),
+    load_in_8bit=True,
+    device_map='auto',
+)
+total_params, params = 0, 0
+
+tokenizer = LlamaTokenizer.from_pretrained(
+    "decapoda-research/llama-{}-hf".format(size), add_eos_token=True,
+    load_in_8bit_fp32_cpu_offload=True, device_map={0: [0]},
+)
+
+model = prepare_model_for_int8_training(model)
+
+config = LoraConfig(
+    r=LORA_R,
+    lora_alpha=LORA_ALPHA,
+    target_modules=TARGET_MODULES,
+    lora_dropout=LORA_DROPOUT,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+config.save_pretrained(OUTPUT_DIR)
+
+model = get_peft_model(model, config)
+tokenizer.pad_token_id = 0
+
+for n, p in model.model.named_parameters():
+    if any([x in n for x in ["lora"]]):
+        total_params += p.numel()
+    params += p.numel()
+
+print(
+    "Total number of parameters: {}M, rate: {}%".format(
+        total_params // 1000 / 1000, round(total_params / params * 100, 2)
+    )
+)
+
+
+# Data Preprocess
+def generate_prompt(data_point):
+    return data_point["input"]
+
+
+def tokenize(prompt):
+    result = tokenizer(
+        prompt,
+        truncation=True,
+        max_length=CUTOFF_LEN + 1,
+        padding="max_length",
+    )
+    return {
+        "input_ids": result["input_ids"][:-1],
+        "attention_mask": result["attention_mask"][:-1],
+    }
+
+
+def generate_and_tokenize_prompt(data_point):
+    prompt = generate_prompt(data_point)
+    return tokenize(prompt)
+
+
+if VAL_SET_SIZE > 0:
+    train_val = data["train"].train_test_split(
+        test_size=VAL_SET_SIZE, shuffle=True, seed=42
+    )
+    train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
+    val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
+else:
+    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
+    val_data = None
+
+
+# Training
+trainer = transformers.Trainer(
+    model=model,
+    train_dataset=train_data,
+    eval_dataset=val_data,
+    args=transformers.TrainingArguments(
+        per_device_train_batch_size=MICRO_BATCH_SIZE,
+        per_device_eval_batch_size=MICRO_BATCH_SIZE,
+        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
+        warmup_steps=100,
+        num_train_epochs=EPOCHS,
+        learning_rate=LEARNING_RATE,
+        fp16=True,
+        logging_steps=20,
+        evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no",
+        save_strategy="steps",
+        eval_steps=200 if VAL_SET_SIZE > 0 else None,
+        save_steps=200,
+        output_dir=OUTPUT_DIR,
+        save_total_limit=100,
+        load_best_model_at_end=True if VAL_SET_SIZE > 0 else False,
+        ddp_find_unused_parameters=False if ddp else None,
+    ),
+    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+)
+model.config.use_cache = False
+
+old_state_dict = model.state_dict
+model.state_dict = (
+    lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
+).__get__(model, type(model))
 
 import gradio as gr
 
 def train():
-    print("Event Dispared")
-    # print(os.listdir(OUTPUT_DIR))
-    # # Call your trainer's train() function here
-    # trainer.train()
-    # print("Training complete.") # optional message to display when training is done
-    # model.save_pretrained(OUTPUT_DIR)
-    # repo.push_to_hub(OUTPUT_DIR, commit_message="Ft model")
+    print(os.listdir(OUTPUT_DIR))
+    # Call your trainer's train() function here
+    trainer.train()
+    print("Training complete.") # optional message to display when training is done
+    model.save_pretrained(OUTPUT_DIR)
+    repo.push_to_hub(OUTPUT_DIR, commit_message="Ft model")
+
 
 iface = gr.Interface(
     fn=train,
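
The trailing context of the diff stops inside the gr.Interface(...) call. For reference, below is a minimal sketch of how such an interface is typically completed and launched; only fn=train appears in this commit, so the inputs, outputs, and launch call are assumptions for illustration, not part of the change.

# Hypothetical completion of the interface shown in the trailing context above.
# `gr` and `train` are the objects defined earlier in finetune.py; everything
# else here is assumed for illustration and does not appear in the commit.
iface = gr.Interface(
    fn=train,        # training entry point defined above
    inputs=None,     # assumed: train() takes no user-supplied inputs
    outputs="text",  # assumed: render whatever train() returns as text
)

iface.launch()  # assumed: start the Gradio app, e.g. inside a Hugging Face Space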