gustavoaq committed
Commit 5c5bb14
1 Parent(s): 3bb2a81

Update finetune.py

Files changed (1)
  1. finetune.py +163 -162
finetune.py CHANGED
@@ -15,176 +15,177 @@ from peft import (
      get_peft_model_state_dict,
  )
  
- HF_TOKEN = os.environ.get("TRL_TOKEN", None)
- if HF_TOKEN:
-     print(HF_TOKEN)
-     repo = Repository(
-         local_dir="./checkpoints/", clone_from="gustavoaq/llama_ft", use_auth_token=HF_TOKEN, repo_type="models"
-     )
-     repo.git_pull()
- # Parameters
-
- MICRO_BATCH_SIZE = 16
- BATCH_SIZE = 32
- size = "7b"
- GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
- EPOCHS = 1
- LEARNING_RATE = float(0.00015)
- CUTOFF_LEN = 512
- LORA_R = 8
- LORA_ALPHA = 16
- LORA_DROPOUT = 0.05
- VAL_SET_SIZE = 2000
- TARGET_MODULES = [
-     "q_proj",
-     "k_proj",
-     "v_proj",
-     "down_proj",
-     "gate_proj",
-     "up_proj",
- ]
- DATA_PATH = "data/data_tmp.json"
- OUTPUT_DIR = "checkpoints/{}".format(size)
-
- if not os.path.exists("data"):
-     os.makedirs("data")
- # Load data
- data = []
- for x in "alpaca,stackoverflow,quora".split(","):
-     data += json.load(open("data/{}_chat_data.json".format(x)))
- random.shuffle(data)
- json.dump(data, open(DATA_PATH, "w"))
- data = load_dataset("json", data_files=DATA_PATH)
-
- # Load Model
- device_map = "auto"
- world_size = int(os.environ.get("WORLD_SIZE", 1))
- ddp = world_size != 1
- if ddp:
-     device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
-     GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size
-
- model = LlamaForCausalLM.from_pretrained(
-     "decapoda-research/llama-{}-hf".format(size),
-     load_in_8bit=True,
-     device_map='auto',
- )
- total_params, params = 0, 0
-
- tokenizer = LlamaTokenizer.from_pretrained(
-     "decapoda-research/llama-{}-hf".format(size), add_eos_token=True,
-     load_in_8bit_fp32_cpu_offload=True, device_map={0: [0]},
- )
-
- model = prepare_model_for_int8_training(model)
-
- config = LoraConfig(
-     r=LORA_R,
-     lora_alpha=LORA_ALPHA,
-     target_modules=TARGET_MODULES,
-     lora_dropout=LORA_DROPOUT,
-     bias="none",
-     task_type="CAUSAL_LM",
- )
- config.save_pretrained(OUTPUT_DIR)
-
- model = get_peft_model(model, config)
- tokenizer.pad_token_id = 0
-
- for n, p in model.model.named_parameters():
-     if any([x in n for x in ["lora"]]):
-         total_params += p.numel()
-     params += p.numel()
-
- print(
-     "Total number of parameters: {}M, rate: {}%".format(
-         total_params // 1000 / 1000, round(total_params / params * 100, 2)
-     )
- )
-
-
- # Data Preprocess
- def generate_prompt(data_point):
-     return data_point["input"]
-
-
- def tokenize(prompt):
-     result = tokenizer(
-         prompt,
-         truncation=True,
-         max_length=CUTOFF_LEN + 1,
-         padding="max_length",
-     )
-     return {
-         "input_ids": result["input_ids"][:-1],
-         "attention_mask": result["attention_mask"][:-1],
-     }
-
-
- def generate_and_tokenize_prompt(data_point):
-     prompt = generate_prompt(data_point)
-     return tokenize(prompt)
-
-
- if VAL_SET_SIZE > 0:
-     train_val = data["train"].train_test_split(
-         test_size=VAL_SET_SIZE, shuffle=True, seed=42
-     )
-     train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
-     val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
- else:
-     train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
-     val_data = None
-
-
- # Training
- trainer = transformers.Trainer(
-     model=model,
-     train_dataset=train_data,
-     eval_dataset=val_data,
-     args=transformers.TrainingArguments(
-         per_device_train_batch_size=MICRO_BATCH_SIZE,
-         per_device_eval_batch_size=MICRO_BATCH_SIZE,
-         gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
-         warmup_steps=100,
-         num_train_epochs=EPOCHS,
-         learning_rate=LEARNING_RATE,
-         fp16=True,
-         logging_steps=20,
-         evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no",
-         save_strategy="steps",
-         eval_steps=200 if VAL_SET_SIZE > 0 else None,
-         save_steps=200,
-         output_dir=OUTPUT_DIR,
-         save_total_limit=100,
-         load_best_model_at_end=True if VAL_SET_SIZE > 0 else False,
-         ddp_find_unused_parameters=False if ddp else None,
-     ),
-     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
- )
- model.config.use_cache = False
-
- old_state_dict = model.state_dict
- model.state_dict = (
-     lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
- ).__get__(model, type(model))
+ # HF_TOKEN = os.environ.get("TRL_TOKEN", None)
+ # if HF_TOKEN:
+ # print(HF_TOKEN)
+ # repo = Repository(
+ # local_dir="./checkpoints/", clone_from="gustavoaq/llama_ft", use_auth_token=HF_TOKEN, repo_type="models"
+ # )
+ # repo.git_pull()
+ # # Parameters
+
+ # MICRO_BATCH_SIZE = 16
+ # BATCH_SIZE = 32
+ # size = "7b"
+ # GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
+ # EPOCHS = 1
+ # LEARNING_RATE = float(0.00015)
+ # CUTOFF_LEN = 512
+ # LORA_R = 8
+ # LORA_ALPHA = 16
+ # LORA_DROPOUT = 0.05
+ # VAL_SET_SIZE = 2000
+ # TARGET_MODULES = [
+ # "q_proj",
+ # "k_proj",
+ # "v_proj",
+ # "down_proj",
+ # "gate_proj",
+ # "up_proj",
+ # ]
+ # DATA_PATH = "data/data_tmp.json"
+ # OUTPUT_DIR = "checkpoints/{}".format(size)
+
+ # if not os.path.exists("data"):
+ # os.makedirs("data")
+ # # Load data
+ # data = []
+ # for x in "alpaca,stackoverflow,quora".split(","):
+ # data += json.load(open("data/{}_chat_data.json".format(x)))
+ # random.shuffle(data)
+ # json.dump(data, open(DATA_PATH, "w"))
+ # data = load_dataset("json", data_files=DATA_PATH)
+
+ # # Load Model
+ # device_map = "auto"
+ # world_size = int(os.environ.get("WORLD_SIZE", 1))
+ # ddp = world_size != 1
+ # if ddp:
+ # device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
+ # GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size
+
+ # model = LlamaForCausalLM.from_pretrained(
+ # "decapoda-research/llama-{}-hf".format(size),
+ # load_in_8bit=True,
+ # device_map='auto',
+ # )
+ # total_params, params = 0, 0
+
+ # tokenizer = LlamaTokenizer.from_pretrained(
+ # "decapoda-research/llama-{}-hf".format(size), add_eos_token=True,
+ # load_in_8bit_fp32_cpu_offload=True, device_map={0: [0]},
+ # )
+
+ # model = prepare_model_for_int8_training(model)
+
+ # config = LoraConfig(
+ # r=LORA_R,
+ # lora_alpha=LORA_ALPHA,
+ # target_modules=TARGET_MODULES,
+ # lora_dropout=LORA_DROPOUT,
+ # bias="none",
+ # task_type="CAUSAL_LM",
+ # )
+ # config.save_pretrained(OUTPUT_DIR)
+
+ # model = get_peft_model(model, config)
+ # tokenizer.pad_token_id = 0
+
+ # for n, p in model.model.named_parameters():
+ # if any([x in n for x in ["lora"]]):
+ # total_params += p.numel()
+ # params += p.numel()
+
+ # print(
+ # "Total number of parameters: {}M, rate: {}%".format(
+ # total_params // 1000 / 1000, round(total_params / params * 100, 2)
+ # )
+ # )
+
+
+ # # Data Preprocess
+ # def generate_prompt(data_point):
+ # return data_point["input"]
+
+
+ # def tokenize(prompt):
+ # result = tokenizer(
+ # prompt,
+ # truncation=True,
+ # max_length=CUTOFF_LEN + 1,
+ # padding="max_length",
+ # )
+ # return {
+ # "input_ids": result["input_ids"][:-1],
+ # "attention_mask": result["attention_mask"][:-1],
+ # }
+
+
+ # def generate_and_tokenize_prompt(data_point):
+ # prompt = generate_prompt(data_point)
+ # return tokenize(prompt)
+
+
+ # if VAL_SET_SIZE > 0:
+ # train_val = data["train"].train_test_split(
+ # test_size=VAL_SET_SIZE, shuffle=True, seed=42
+ # )
+ # train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
+ # val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
+ # else:
+ # train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
+ # val_data = None
+
+
+ # # Training
+ # trainer = transformers.Trainer(
+ # model=model,
+ # train_dataset=train_data,
+ # eval_dataset=val_data,
+ # args=transformers.TrainingArguments(
+ # per_device_train_batch_size=MICRO_BATCH_SIZE,
+ # per_device_eval_batch_size=MICRO_BATCH_SIZE,
+ # gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
+ # warmup_steps=100,
+ # num_train_epochs=EPOCHS,
+ # learning_rate=LEARNING_RATE,
+ # fp16=True,
+ # logging_steps=20,
+ # evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no",
+ # save_strategy="steps",
+ # eval_steps=200 if VAL_SET_SIZE > 0 else None,
+ # save_steps=200,
+ # output_dir=OUTPUT_DIR,
+ # save_total_limit=100,
+ # load_best_model_at_end=True if VAL_SET_SIZE > 0 else False,
+ # ddp_find_unused_parameters=False if ddp else None,
+ # ),
+ # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+ # )
+ # model.config.use_cache = False
+
+ # old_state_dict = model.state_dict
+ # model.state_dict = (
+ # lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
+ # ).__get__(model, type(model))
  
  import gradio as gr
  
  def train():
-     print(os.listdir(OUTPUT_DIR))
-     # Call your trainer's train() function here
-     trainer.train()
-     print("Training complete.") # optional message to display when training is done
-     model.save_pretrained(OUTPUT_DIR)
-     repo.push_to_hub(OUTPUT_DIR, commit_message="Ft model")
+     print("Event Dispared")
+     # print(os.listdir(OUTPUT_DIR))
+     # # Call your trainer's train() function here
+     # trainer.train()
+     # print("Training complete.") # optional message to display when training is done
+     # model.save_pretrained(OUTPUT_DIR)
+     # repo.push_to_hub(OUTPUT_DIR, commit_message="Ft model")
  
  iface = gr.Interface(
      fn=train,
-     inputs=None,
-     outputs=None,
+     inputs=gr.inputs.Textbox(label="Input text"),
+     outputs=gr.outputs.Textbox(label="Output length"),
      title="Training Interface",
-     description="Click the button to start training.",
+     description="Enter some text and click the button to start training.",
      theme="default",
      layout="vertical",
      allow_flagging=False,