dh-mc committed

Commit d028752 · Parent: cf912f1

rtx4090 0-shot
data/Llama3.1-8B-Chinese-Chat_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/Mistral-7B-v0.3-Chinese-Chat_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/Qwen2-7B-Instruct_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/internlm2_5-7b-chat-1m_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/internlm2_5-7b-chat_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
llm_toolkit/eval_shots.py CHANGED

@@ -117,6 +117,7 @@ def evaluate_model_with_num_shots(
         tokenizer=tokenizer,
         chinese_prompt=not use_english_datasets,
         using_p1=False,
+        num_shots=num_shots,
     )
     if len(sys.argv) > 1:
         num = int(sys.argv[1])

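Read together with the logical_reasoning_utils.py hunk below, this change forwards the shot count chosen by evaluate_model_with_num_shots into dataset loading instead of always building a zero-shot prompt. A minimal sketch of the assumed call site is shown here; the enclosing loop and variable names such as data_path are illustrative assumptions, not verbatim from eval_shots.py:

# Hypothetical sketch: how the updated call is presumably issued for each
# shot count being evaluated (only the keyword arguments appear in the diff).
for num_shots in [0, 5, 10]:  # assumed values; the real ones come from the script
    datasets = load_logical_reasoning_dataset(
        data_path,
        tokenizer=tokenizer,
        chinese_prompt=not use_english_datasets,
        using_p1=False,
        num_shots=num_shots,  # new parameter: 0 keeps the original zero-shot prompt
    )
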
llm_toolkit/logical_reasoning_utils.py CHANGED

@@ -263,7 +263,12 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
 
 
 def load_logical_reasoning_dataset(
-    data_path, tokenizer=None, using_p1=True, chinese_prompt=True, test_data=None
+    data_path,
+    tokenizer=None,
+    using_p1=True,
+    chinese_prompt=True,
+    test_data=None,
+    num_shots=0,
 ):
     postfix = "" if chinese_prompt else "_en"
     train_data_file = data_path + f"/train{postfix}.csv"
@@ -276,7 +281,11 @@ def load_logical_reasoning_dataset(
     )
 
     if tokenizer:
-        reasoning_prompt = get_prompt_template(using_p1, chinese_prompt)
+        reasoning_prompt = (
+            get_prompt_template(using_p1, chinese_prompt)
+            if num_shots == 0
+            else get_few_shot_prompt_template(num_shots, datasets["train"].to_pandas())
+        )
 
         def formatting_prompts_func(examples):
             inputs = examples["text"]

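get_few_shot_prompt_template is referenced here but not defined in this diff, so its exact behavior is not visible. As a rough, hypothetical sketch of the pattern the call implies (a prompt template that prepends num_shots worked examples from the train split, leaving a slot for the current input), one might write:

import pandas as pd

def get_few_shot_prompt_template(num_shots: int, train_df: pd.DataFrame) -> str:
    # Hypothetical sketch only -- the real implementation lives elsewhere in
    # llm_toolkit and may differ. Assumes the train split exposes "text" and
    # "label" columns; only "text" is confirmed by formatting_prompts_func above.
    examples = train_df.head(num_shots)
    shots = "\n\n".join(
        f"Text: {row['text']}\nAnswer: {row['label']}"
        for _, row in examples.iterrows()
    )
    # Leave "{}" as the slot for the test-time input, assuming the returned
    # template is later filled with each example's text like reasoning_prompt.
    return shots + "\n\nText: {}\nAnswer: "

With num_shots=0 (the setting named in this commit), the new branch is skipped entirely and get_prompt_template is used exactly as before.
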
llm_toolkit/translation_engine.py DELETED

@@ -1,130 +0,0 @@
-import os
-import pandas as pd
-import torch
-from unsloth import FastLanguageModel, is_bfloat16_supported
-from trl import SFTTrainer
-from transformers import TrainingArguments, TextStreamer
-from llm_toolkit.translation_utils import *
-from llamafactory.chat import ChatModel
-
-print(f"loading {__file__}")
-
-
-def get_model_names(
-    model_name, save_method="merged_4bit_forced", quantization_method="q5_k_m"
-):
-    hub_model = model_name.split("/")[-1] + "-MAC-"
-    local_model = "models/" + hub_model
-
-    return {
-        "local": local_model + save_method,
-        "local-gguf": local_model + quantization_method,
-        "hub": hub_model + save_method,
-        "hub-gguf": hub_model + "gguf-" + quantization_method,
-    }
-
-
-def load_model(
-    model_name,
-    max_seq_length=2048,
-    dtype=None,
-    load_in_4bit=False,
-    template="chatml",
-    adapter_name_or_path=None,
-):
-    print(f"loading model: {model_name}")
-
-    if adapter_name_or_path:
-        args = dict(
-            model_name_or_path=model_name,
-            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-            template=template,  # same to the one in training
-            finetuning_type="lora",  # same to the one in training
-            quantization_bit=4,  # load 4-bit quantized model
-        )
-        chat_model = ChatModel(args)
-        return chat_model.engine.model, chat_model.engine.tokenizer
-
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
-        max_seq_length=max_seq_length,
-        dtype=dtype,
-        load_in_4bit=load_in_4bit,
-        trust_remote_code=True,
-    )
-    FastLanguageModel.for_inference(model)
-
-    return model, tokenizer
-
-
-def test_model(model, tokenizer, prompt):
-    inputs = tokenizer(
-        [prompt],
-        return_tensors="pt",
-    ).to("cuda")
-
-    text_streamer = TextStreamer(tokenizer)
-
-    _ = model.generate(
-        **inputs, max_new_tokens=128, streamer=text_streamer, use_cache=True
-    )
-
-
-def load_trainer(
-    model,
-    tokenizer,
-    dataset,
-    num_train_epochs,
-    max_seq_length=2048,
-    fp16=False,
-    bf16=False,
-    output_dir="./outputs",
-):
-    model = FastLanguageModel.get_peft_model(
-        model,
-        r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-        target_modules=[
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
-        ],
-        lora_alpha=16,
-        lora_dropout=0,  # Supports any, but = 0 is optimized
-        bias="none",  # Supports any, but = "none" is optimized
-        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
-        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
-        random_state=3407,
-        use_rslora=False,  # We support rank stabilized LoRA
-        loftq_config=None,  # And LoftQ
-    )
-
-    trainer = SFTTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        train_dataset=dataset,
-        dataset_text_field="text",
-        max_seq_length=max_seq_length,
-        dataset_num_proc=2,
-        packing=False,  # Can make training 5x faster for short sequences.
-        args=TrainingArguments(
-            per_device_train_batch_size=2,
-            gradient_accumulation_steps=4,
-            warmup_steps=5,
-            num_train_epochs=num_train_epochs,
-            learning_rate=2e-4,
-            fp16=not is_bfloat16_supported(),
-            bf16=is_bfloat16_supported(),
-            logging_steps=100,
-            optim="adamw_8bit",
-            weight_decay=0.01,
-            lr_scheduler_type="linear",
-            seed=3407,
-            output_dir=output_dir,
-        ),
-    )
-
-    return trainer

llm_toolkit/translation_utils.py DELETED

@@ -1,420 +0,0 @@
-import os
-import re
-import pandas as pd
-import evaluate
-import seaborn as sns
-import matplotlib.pyplot as plt
-from datasets import load_dataset
-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-from tqdm import tqdm
-
-print(f"loading {__file__}")
-
-bleu = evaluate.load("bleu")
-rouge = evaluate.load("rouge")
-meteor = evaluate.load("meteor")
-accuracy = evaluate.load("accuracy")
-
-
-def extract_answer(text, debug=False):
-    if text:
-        # Remove the begin and end tokens
-        text = re.sub(
-            r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
-        )
-        if debug:
-            print("--------\nstep 1:", text)
-
-        text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
-        if debug:
-            print("--------\nstep 2:", text)
-
-        text = re.sub(
-            r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
-        )
-        if debug:
-            print("--------\nstep 3:", text)
-
-    return text
-
-
-def calc_metrics(references, predictions, debug=False):
-    assert len(references) == len(
-        predictions
-    ), f"lengths are difference: {len(references)} != {len(predictions)}"
-
-    predictions = [extract_answer(text) for text in predictions]
-
-    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
-    accuracy = sum(correct) / len(references)
-
-    results = {"accuracy": accuracy}
-    if debug:
-        correct_ids = [i for i, c in enumerate(correct) if c == 1]
-        results["correct_ids"] = correct_ids
-
-    results["meteor"] = meteor.compute(predictions=predictions, references=references)[
-        "meteor"
-    ]
-
-    results["bleu_scores"] = bleu.compute(
-        predictions=predictions, references=references, max_order=4
-    )
-    results["rouge_scores"] = rouge.compute(
-        predictions=predictions, references=references
-    )
-    return results
-
-
-def save_results(model_name, results_path, dataset, predictions, debug=False):
-    if not os.path.exists(results_path):
-        # Get the directory part of the file path
-        dir_path = os.path.dirname(results_path)
-
-        # Create all directories in the path (if they don't exist)
-        os.makedirs(dir_path, exist_ok=True)
-        df = dataset.to_pandas()
-        df.drop(columns=["text", "prompt"], inplace=True)
-    else:
-        df = pd.read_csv(results_path, on_bad_lines="warn")
-
-    df[model_name] = predictions
-
-    if debug:
-        print(df.head(1))
-
-    df.to_csv(results_path, index=False)
-
-
-def load_translation_dataset(data_path, tokenizer=None):
-    train_data_file = data_path.replace(".tsv", "-train.tsv")
-    test_data_file = data_path.replace(".tsv", "-test.tsv")
-
-    if not os.path.exists(train_data_file):
-        print("generating train/test data files")
-        dataset = load_dataset(
-            "csv", data_files=data_path, delimiter="\t", split="train"
-        )
-        print(len(dataset))
-        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
-
-        datasets = dataset.train_test_split(test_size=0.2)
-        print(len(dataset))
-
-        # Convert to pandas DataFrame
-        train_df = pd.DataFrame(datasets["train"])
-        test_df = pd.DataFrame(datasets["test"])
-
-        # Save to TSV
-        train_df.to_csv(train_data_file, sep="\t", index=False)
-        test_df.to_csv(test_data_file, sep="\t", index=False)
-
-    print("loading train/test data files")
-    datasets = load_dataset(
-        "csv",
-        data_files={"train": train_data_file, "test": test_data_file},
-        delimiter="\t",
-    )
-
-    if tokenizer:
-        translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
-
-        def formatting_prompts_func(examples):
-            inputs = examples["chinese"]
-            outputs = examples["english"]
-
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are an expert in translating Chinese to English.",
-                },
-                None,
-            ]
-
-            model_name = os.getenv("MODEL_NAME")
-
-            if "mistral" in model_name.lower():
-                messages = messages[1:]
-
-            texts = []
-            prompts = []
-            for input, output in zip(inputs, outputs):
-                prompt = translation_prompt.format(input)
-                messages[-1] = {"role": "user", "content": prompt}
-
-                prompt = tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True
-                )
-                prompts.append(prompt)
-                texts.append(prompt + output + tokenizer.eos_token)
-            return {"text": texts, "prompt": prompts}
-
-        datasets = datasets.map(
-            formatting_prompts_func,
-            batched=True,
-        )
-
-    print(datasets)
-    return datasets
-
-
-def eval_model(model, tokenizer, eval_dataset):
-    total = len(eval_dataset)
-    predictions = []
-    for i in tqdm(range(total)):
-        inputs = tokenizer(
-            eval_dataset["prompt"][i : i + 1],
-            return_tensors="pt",
-        ).to("cuda")
-
-        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
-        decoded_output = tokenizer.batch_decode(outputs)
-        debug = i == 0
-        decoded_output = [
-            extract_answer(output, debug=debug) for output in decoded_output
-        ]
-        predictions.extend(decoded_output)
-
-    return predictions
-
-
-def save_model(
-    model,
-    tokenizer,
-    include_gguf=True,
-    include_merged=True,
-    publish=True,
-):
-    try:
-        token = os.getenv("HF_TOKEN") or None
-        model_name = os.getenv("MODEL_NAME")
-
-        save_method = "lora"
-        quantization_method = "q5_k_m"
-
-        model_names = get_model_names(
-            model_name, save_method=save_method, quantization_method=quantization_method
-        )
-
-        model.save_pretrained(model_names["local"])
-        tokenizer.save_pretrained(model_names["local"])
-
-        if publish:
-            model.push_to_hub(
-                model_names["hub"],
-                token=token,
-            )
-            tokenizer.push_to_hub(
-                model_names["hub"],
-                token=token,
-            )
-
-        if include_merged:
-            model.save_pretrained_merged(
-                model_names["local"] + "-merged", tokenizer, save_method=save_method
-            )
-            if publish:
-                model.push_to_hub_merged(
-                    model_names["hub"] + "-merged",
-                    tokenizer,
-                    save_method="lora",
-                    token="",
-                )
-
-        if include_gguf:
-            model.save_pretrained_gguf(
-                model_names["local-gguf"],
-                tokenizer,
-                quantization_method=quantization_method,
-            )
-
-            if publish:
-                model.push_to_hub_gguf(
-                    model_names["hub-gguf"],
-                    tokenizer,
-                    quantization_method=quantization_method,
-                    token=token,
-                )
-    except Exception as e:
-        print(e)
-
-
-def get_metrics(df):
-    metrics_df = pd.DataFrame(df.columns.T)[2:]
-    metrics_df.rename(columns={0: "model"}, inplace=True)
-    metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/")[-1])
-    metrics_df.reset_index(inplace=True)
-    metrics_df = metrics_df.drop(columns=["index"])
-
-    accuracy = []
-    meteor = []
-    bleu_1 = []
-    rouge_l = []
-    all_metrics = []
-    for col in df.columns[2:]:
-        metrics = calc_metrics(df["english"], df[col], debug=True)
-        print(f"{col}: {metrics}")
-
-        accuracy.append(metrics["accuracy"])
-        meteor.append(metrics["meteor"])
-        bleu_1.append(metrics["bleu_scores"]["bleu"])
-        rouge_l.append(metrics["rouge_scores"]["rougeL"])
-        all_metrics.append(metrics)
-
-    metrics_df["accuracy"] = accuracy
-    metrics_df["meteor"] = meteor
-    metrics_df["bleu_1"] = bleu_1
-    metrics_df["rouge_l"] = rouge_l
-    metrics_df["all_metrics"] = all_metrics
-
-    return metrics_df
-
-
-def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
-    plt.figure(figsize=figsize)
-    df_melted = pd.melt(
-        metrics_df, id_vars="model", value_vars=["meteor", "bleu_1", "rouge_l"]
-    )
-
-    barplot = sns.barplot(x="variable", y="value", hue="model", data=df_melted)
-
-    # Set different hatches for each model
-    hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*", "//", "\\\\"]
-
-    # Create a dictionary to map models to hatches
-    model_hatches = {
-        model: hatches[i % len(hatches)]
-        for i, model in enumerate(metrics_df["model"].unique())
-    }
-
-    # Apply hatches based on the model
-    num_vars = len(df_melted["variable"].unique())
-    for i, bar in enumerate(barplot.patches):
-        model = df_melted["model"].iloc[i // num_vars]
-        bar.set_hatch(model_hatches[model])
-
-    # Manually update legend to match the bar hatches
-    handles, labels = barplot.get_legend_handles_labels()
-    for handle, model in zip(handles, metrics_df["model"].unique()):
-        handle.set_hatch(model_hatches[model])
-
-    barplot.set_xticklabels(["METEOR", "BLEU-1", "ROUGE-L"])
-    for p in barplot.patches:
-        if p.get_height() == 0:
-            continue
-        barplot.annotate(
-            f"{p.get_height():.2f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_height()),
-            ha="center",
-            va="center",
-            xytext=(0, 10),
-            textcoords="offset points",
-        )
-
-    barplot.set(ylim=ylim, ylabel="Scores", xlabel="Metrics")
-    plt.legend(bbox_to_anchor=(0.5, -0.1), loc="upper center")
-    plt.show()
-
-
-def plot_times(perf_df, ylim=0.421):
-    # Adjusted code to put "train-time" bars in red at the bottom
-
-    fig, ax1 = plt.subplots(figsize=(12, 10))
-
-    color_train = "tab:red"
-    color_eval = "orange"
-    ax1.set_xlabel("Models")
-    ax1.set_ylabel("Time (mins)")
-    ax1.set_xticks(range(len(perf_df["model"])))  # Set x-ticks positions
-    ax1.set_xticklabels(perf_df["model"], rotation=90)
-
-    # Plot "train-time" first so it's at the bottom
-    ax1.bar(
-        perf_df["model"],
-        perf_df["train-time(mins)"],
-        color=color_train,
-        label="train-time",
-    )
-
-    # Then, plot "eval-time" on top of "train-time"
-    ax1.bar(
-        perf_df["model"],
-        perf_df["eval-time(mins)"],
-        bottom=perf_df["train-time(mins)"],
-        color=color_eval,
-        label="eval-time",
-    )
-
-    ax1.tick_params(axis="y")
-    ax1.legend(loc="upper left")
-
-    if "meteor" in perf_df.columns:
-        ax2 = ax1.twinx()
-        color_meteor = "tab:blue"
-        ax2.set_ylabel("METEOR", color=color_meteor)
-        ax2.plot(
-            perf_df["model"],
-            perf_df["meteor"],
-            color=color_meteor,
-            marker="o",
-            label="meteor",
-        )
-        ax2.tick_params(axis="y", labelcolor=color_meteor)
-        ax2.legend(loc="upper right")
-        ax2.set_ylim(ax2.get_ylim()[0], ylim)
-
-    # Show numbers in bars
-    for p in ax1.patches:
-        height = p.get_height()
-        if height == 0:  # Skip bars with height 0
-            continue
-        ax1.annotate(
-            f"{height:.2f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_y() + height),
-            ha="center",
-            va="center",
-            xytext=(0, -10),
-            textcoords="offset points",
-        )
-
-    fig.tight_layout()
-    plt.show()
-
-
-def translate_via_llm(text):
-    base_url = os.getenv("OPENAI_BASE_URL") or "http://localhost:8000/v1"
-    llm = ChatOpenAI(
-        model="gpt-4o",
-        temperature=0,
-        max_tokens=None,
-        timeout=None,
-        max_retries=2,
-        base_url=base_url,
-    )
-
-    prompt = ChatPromptTemplate.from_messages(
-        [
-            (
-                "human",
-                "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{input}",
-            ),
-        ]
-    )
-
-    chain = prompt | llm
-    response = chain.invoke(
-        {
-            "input": text,
-        }
-    )
-    return response.content
-
-
-def translate(text, cache_dict):
-    if text in cache_dict:
-        return cache_dict[text]
-    else:
-        translated_text = translate_via_llm(text)
-        cache_dict[text] = translated_text
-        return translated_text