hendrydong committed on
Commit
e4f3acf
1 Parent(s): 176f39e

Upload 32 files

Browse files
lmflow/.DS_Store ADDED
Binary file (10.2 kB). View file
 
lmflow/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .version import __version__ as internal_version
2
+
3
+ __version__ = internal_version
4
+
5
+ from transformers.utils import check_min_version
6
+ from transformers.utils.versions import require_version
7
+
8
+ from lmflow import args, datasets, models, pipeline, utils
9
+
10
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
11
+ check_min_version("4.27.0.dev0")
12
+
13
+ require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
lmflow/args.py ADDED
@@ -0,0 +1,622 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """This script defines dataclasses: ModelArguments and DatasetArguments,
4
+ that contain the arguments for the model and dataset used in training.
5
+
6
+ It imports several modules, including dataclasses, field from typing, Optional from typing,
7
+ require_version from transformers.utils.versions, MODEL_FOR_CAUSAL_LM_MAPPING,
8
+ and TrainingArguments from transformers.
9
+
10
+ MODEL_CONFIG_CLASSES is assigned a list of the model config classes from
11
+ MODEL_FOR_CAUSAL_LM_MAPPING. MODEL_TYPES is assigned a tuple of the model types
12
+ extracted from the MODEL_CONFIG_CLASSES.
13
+ """
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Optional, List
17
+
18
+ from transformers.utils.versions import require_version
19
+
20
+ from transformers import (
21
+ MODEL_FOR_CAUSAL_LM_MAPPING,
22
+ TrainingArguments,
23
+ )
24
+
25
# Config classes of every architecture registered in the causal-LM mapping.
MODEL_CONFIG_CLASSES = [*MODEL_FOR_CAUSAL_LM_MAPPING.keys()]
# The `model_type` identifier of each of those config classes, in the same order.
MODEL_TYPES = tuple(
    config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES
)
27
+
28
+
29
@dataclass
class ModelArguments:
    """
    Arguments selecting the model, config and tokenizer, plus LoRA options.

    model_name_or_path : str
        a string representing the path or name of a pretrained
        model checkpoint for weights initialization. If None, a model will be
        trained from scratch.

    lora_model_path : str
        a string representing the path of the incremental model diff produced
        by LoRA finetuning.

    model_type : str
        a string representing the type of model to use if training from
        scratch. If not provided, a pretrained model will be used.

    arch_type : str
        the architecture type, one of "decoder_only", "encoder_decoder" or
        "text_regression".

    config_overrides : str
        a string representing the default config settings to override
        when training a model from scratch.

    config_name : str
        a string representing the name or path of the pretrained config to
        use, if different from the model_name_or_path.

    tokenizer_name : str
        a string representing the name or path of the pretrained tokenizer
        to use, if different from the model_name_or_path.

    cache_dir : str
        a string representing the path to the directory where pretrained models
        downloaded from huggingface.co will be stored.

    use_fast_tokenizer : bool
        a boolean indicating whether to use a fast tokenizer (backed by the
        tokenizers library) or not.

    model_revision : str
        a string representing the specific model version to use (can be a
        branch name, tag name, or commit id).

    use_auth_token : bool
        a boolean indicating whether to use the token generated when running
        huggingface-cli login (necessary to use this script with private models).

    torch_dtype : str
        a string representing the dtype to load the model under. If auto is
        passed, the dtype will be automatically derived from the model's weights.

    use_lora, lora_r, lora_alpha, lora_target_modules, lora_dropout,
    save_aggregated_lora
        LoRA switches and hyper-parameters; see each field's help text.

    use_ram_optimized_load : bool
        a boolean indicating whether to use disk mapping when memory is not
        enough.

    Raises
    ------
    ValueError
        if `config_overrides` is combined with `config_name` or
        `model_name_or_path`.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. "
                "Don't set if you want to train a model from scratch."
            )
        },
    )
    lora_model_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The incremental model diff introduced by LoRA finetuning."
                " Along with the original non-finetuned model forms the whole"
                " finetuned model."
            )
        }
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    # `arch_type` used to be declared twice; only the later declaration (the
    # one carrying "choices") ever took effect. It is declared once here, at
    # the position of the first declaration so the field order is unchanged.
    arch_type: Optional[str] = field(
        default="decoder_only",
        metadata={
            "help": (
                "Model architecture type, e.g. \"decoder_only\","
                " \"encoder_decoder\""
            ),
            "choices": ["decoder_only", "encoder_decoder", "text_regression"],
        },
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    use_lora: bool = field(
        default=False,
        metadata={"help": "Whether to lora."},
    )
    lora_r: int = field(
        default=8,
        metadata={"help": "the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has."},
    )
    lora_alpha: int = field(
        default=32,
        metadata={"help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."},
    )
    # The default is None, so the annotation is Optional; the help text used
    # to be a copy of the `config_name` help.
    lora_target_modules: Optional[List[str]] = field(
        default=None,
        metadata={"help": "Names of the model modules that LoRA is applied to."},
    )
    lora_dropout: float = field(
        default=0.1,
        metadata={"help": "The dropout rate in lora.linear."},
    )
    save_aggregated_lora: bool = field(
        default=False,
        metadata={"help": "Whether to save aggregated lora."},
    )
    use_ram_optimized_load: bool = field(
        default=True,
        metadata={"help": "Whether use disk mapping when memory is not enough."}
    )

    def __post_init__(self):
        # Config overrides only apply to a model built from scratch, so they
        # are rejected when a pretrained config or checkpoint is also named.
        uses_pretrained = (
            self.config_name is not None or self.model_name_or_path is not None
        )
        if self.config_overrides is not None and uses_pretrained:
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )
196
+
197
+
198
@dataclass
class DatasetArguments:
    """
    Arguments describing the dataset used by a language-model pipeline.

    dataset_path : str
        a string representing the path of the dataset to use.

    dataset_name : str
        a string representing the name of the dataset to use. The default value is "customized".

    is_custom_dataset : bool
        a boolean indicating whether to use custom data. The default value is False.

    customized_cache_dir : str
        a string representing the path to the directory where customized dataset caches will be stored.

    dataset_config_name : str
        a string representing the configuration name of the dataset to use (via the datasets library).

    train_file : str
        a string representing the path to the input training data file (csv, json or txt).

    validation_file : str
        a string representing the path to the input evaluation data file to evaluate the perplexity on
        (csv, json or txt).

    max_train_samples : int
        an integer indicating the maximum number of training examples to use for debugging or quicker training.
        If set, the training dataset will be truncated to this number.

    max_eval_samples : int
        an integer indicating the maximum number of evaluation examples to use for debugging or quicker training.
        Defaults to 10**10.

    streaming : bool
        a boolean indicating whether to enable streaming mode.

    block_size : int
        an integer indicating the optional input sequence length after tokenization. The training dataset will be
        truncated in blocks of this size for training.

    The class also includes `overwrite_cache`, `validation_split_percentage`, `preprocessing_num_workers`,
    `disable_group_texts`, `keep_linebreaks` and `test_file`; see each field's help text.

    Raises
    ------
    ValueError
        if neither a dataset name nor a training/validation file is given.
    AssertionError
        if `train_file` or `validation_file` is not a csv, json or txt file.
    """

    dataset_path: Optional[str] = field(
        default=None, metadata={"help": "The path of the dataset to use."}
    )
    dataset_name: Optional[str] = field(
        default="customized", metadata={"help": "Should be \"customized\""}
    )
    is_custom_dataset: Optional[bool] = field(
        default=False, metadata={"help": "whether to use custom data"}
    )
    customized_cache_dir: Optional[str] = field(
        default=".cache/llm-ft/datasets",
        metadata={"help": "Where do you want to store the customized dataset caches"},
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    # The default used to be the float literal 1e10 on an int-typed field;
    # the same value is now spelled as an int so it is usable as a count.
    max_eval_samples: Optional[int] = field(
        default=10 ** 10,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    disable_group_texts: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether we group original samples together to generate sample"
                " sequences of length `block_size`. By default, we group every"
                " 1000 tokenized sequences together, divide them into "
                " [{total_num_tokens} / {block_size}] sequences, each with"
                " `block_size` tokens (the remaining tokens are omitted)."
                " If this flag is set to True, we only group 1 tokenized"
                " sequence, i.e. cutting long sequence into chunks."
            )
        },
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "Evaluation File Path"},
    )

    def __post_init__(self):
        if self.streaming:
            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")

        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")

        # Only the text after the last "." is inspected. The asserts are kept
        # so the exception type seen by existing callers stays AssertionError.
        if self.train_file is not None:
            extension = self.train_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
        if self.validation_file is not None:
            extension = self.validation_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
348
+
349
+
350
@dataclass
class FinetunerArguments(TrainingArguments):
    """
    Finetuner arguments: transformers.TrainingArguments reused as-is.

    No field is added or overridden by this subclass.
    """
356
+
357
+
358
@dataclass
class EvaluatorArguments:
    """
    Options that configure an evaluator.

    local_rank : int, default = -1
        For distributed training: local_rank

    random_shuffle : bool, default = False

    use_wandb : bool, default = False
        When True, wandb will be enabled.

    random_seed : int, default = 1

    output_dir : str, default = './output_dir'

    mixed_precision : str, choice from ["bf16","fp16"].
        mixed precision mode, whether to use bf16 or fp16

    deepspeed : str
        Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already
        loaded json file as a dict

    answer_type : str, default = "text"
        Question type used for answer extraction from the decoder output.

    prompt_structure : str, default = "{input}"
        Template formatted with `input` to build the model input.

    evaluate_block_size : int, default = 512

    metric : str, default = "accuracy"
    """

    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
    random_shuffle: Optional[bool] = field(
        default=False,
        metadata={"help": ""},
    )
    use_wandb: Optional[bool] = field(
        default=False,
        metadata={"help": "When this flag is True, wandb will be enabled"},
    )
    random_seed: Optional[int] = field(
        default=1,
        metadata={"help": "used to set random seed"},
    )
    output_dir: Optional[str] = field(
        default="./output_dir",
        metadata={"help": "Output path for the inferenced results"},
    )
    mixed_precision: Optional[str] = field(
        default="bf16",
        metadata={
            "help": "mixed precision mode, whether to use bf16 or fp16",
            "choices": ["bf16", "fp16"],
        },
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Enable deepspeed and pass the path to deepspeed json config"
                " file (e.g. ds_config.json) or an already"
                " loaded json file as a dict"
            )
        },
    )
    answer_type: Optional[str] = field(
        default="text",
        metadata={
            "help": (
                "Question type for answer extraction from the decoder output."
                " Supported types: \n"
                " 1) \"multiple_choice\", e.g. A, B, C, D, ...\n"
                " 2) \"binary_choice\", e.g. yes, no, maybe\n"
                " 3) \"math\", e.g. 1.0, -3.52\n"
                " 4) \"text\", e.g. \"I think that it is okay\"\n"
                " 5) Special treatment for several datasets\n"
                " - \"gsm8k\"\n"
                " - \"svamp\"\n"
                " - \"asdiv\"\n"
                " - \"addsub\"\n"
                " - \"singleeq\"\n"
                " - \"multiarith\"\n"
                " - \"aqua\"\n"
                " - \"csqa\"\n"
                " - \"strategyqa\"\n"
                " - \"pubmedqa\"\n"
                " - \"medmcqa\"\n"
                " - \"usmle\"\n"
            )
        },
    )
    prompt_structure: Optional[str] = field(
        default="{input}",
        metadata={
            "help": (
                "Prompt structure to facilitate prompt engineering during inference."
                " The model will receive `prompt_structure.format(input=input)`"
                " as its input."
            )
        },
    )
    evaluate_block_size: Optional[int] = field(
        default=512,
        metadata={
            "help": (
                "the model will have at least block_size tokens for context"
                " when calculating the conditional likelihood of any one token"
                " (provided there are block_size preceding tokens available"
                " to condition on)"
            )
        },
    )
    metric: Optional[str] = field(
        default="accuracy",
        metadata={
            "help": "the metric the model will be evaluated on",
            "choices": [
                "ppl",
                "perplexity",
                "acc",
                "accuracy",
                "nll",
                "neg_log_likelihood",
            ],
        },
    )
484
+
485
+
486
@dataclass
class InferencerArguments:
    """
    Options that configure an inferencer.

    device : str, choice from ["gpu", "cpu"], default = "gpu"
        device of chatbot

    local_rank : int, default = -1
        For distributed training: local_rank

    random_seed : int, default = 1

    deepspeed : str
        Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already
        loaded json file as a dict

    mixed_precision : str, choice from ["bf16","fp16"].
        mixed precision mode, whether to use bf16 or fp16
    """

    device: str = field(
        default="gpu",
        metadata={"help": "device of chatbot", "choices": ["gpu", "cpu"]},
    )
    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
    random_seed: Optional[int] = field(
        default=1,
        metadata={"help": "used to set random seed"},
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Enable deepspeed and pass the path to deepspeed json config"
                " file (e.g. ds_config.json) or an already"
                " loaded json file as a dict"
            )
        },
    )
    mixed_precision: Optional[str] = field(
        default="bf16",
        metadata={
            "help": "mixed precision mode, whether to use bf16 or fp16",
            "choices": ["bf16", "fp16"],
        },
    )
542
+
543
+
544
@dataclass
class RaftAlignerArguments(TrainingArguments):
    """
    Define a class RaftAlignerArguments to configure raft aligner.

    Extends transformers.TrainingArguments with the fields below.

    output_reward_path : str, default = "tmp/raft_aligner/"
        The path of output rewards.

    output_min_length : int, default = 16
        minimum length of the generated output token sequence.

    output_max_length : int, default = 48
        maximum length of the generated output token sequence.

    num_raft_iteration : int, default = 20
        number of iterations of the raft aligner.

    raft_batch_size : int, default = 320
        number of samples selected each time to generate rewards and be ranked.

    top_reward_percentage : float, default = 0.2
        fraction of the raft batch (ranked by reward) kept for SFT.

    inference_batch_size_per_device : int, default = 1
        number of samples each device infers in parallel.
    """
    output_reward_path: Optional[str] = field(
        default="tmp/raft_aligner/",
        metadata={
            "help": "The path of output rewards."
        }
    )
    output_min_length: Optional[int] = field(
        default=16,
        metadata={
            "help": (
                "minimum length of the output token sequence generated from"
                " model given an input."
            ),
        },
    )
    output_max_length: Optional[int] = field(
        default=48,
        metadata={
            "help": (
                "maximum length of the output token sequence generated from"
                " model given an input."
            ),
        },
    )
    num_raft_iteration: Optional[int] = field(
        default=20,
        metadata={
            "help": "number of iterations of the raft aligner."
        },
    )
    raft_batch_size: Optional[int] = field(
        default=320,
        metadata={
            "help": (
                "only select {raft_batch_size} samples each time to"
                " generate rewards and be ranked for SFT training."
            )
        },
    )
    # This is a fraction (default 0.2), so it must be typed float: an
    # argparse-based parser derives the CLI converter from the annotation and
    # with `int` would reject a value such as "0.3".
    top_reward_percentage: Optional[float] = field(
        default=0.2,
        metadata={
            "help": (
                "only top {top_reward_percentage} samples in the raft batch,"
                " (in terms of rewards), will be used for SFT the model."
            ),
        },
    )
    inference_batch_size_per_device: Optional[int] = field(
        default=1,
        metadata={
            "help": (
                "every device will infer {inference_batch_size_per_device}"
                " samples in parallel. The inferred results will be concatenated"
                " with inputs and attach a reward."
            ),
        },
    )
607
+
608
+
609
# Maps a pipeline name to the dataclass holding that pipeline's arguments.
PIPELINE_ARGUMENT_MAPPING = {
    "finetuner": FinetunerArguments,
    "evaluator": EvaluatorArguments,
    "inferencer": InferencerArguments,
    "raft_aligner": RaftAlignerArguments,
}


class AutoArguments:
    """
    Automatically choose the arguments class that matches a pipeline name.
    """

    @staticmethod
    def get_pipeline_args_class(pipeline_name: str):
        """
        Return the arguments dataclass registered for `pipeline_name`.

        Declared as a static method so it works both on the class and on an
        instance; without the decorator an instance call would pass the
        instance as `pipeline_name`.

        Parameters
        ------------
        pipeline_name : str
            A key of PIPELINE_ARGUMENT_MAPPING, e.g. "finetuner".

        Raises
        ------------
        KeyError
            If `pipeline_name` is not a registered pipeline.
        """
        return PIPELINE_ARGUMENT_MAPPING[pipeline_name]
lmflow/datasets/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """This Python code defines a class Dataset with methods for initializing, loading,
2
+ and manipulating datasets from different backends such as Hugging Face and JSON.
3
+
4
+ The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging
5
+ Face dataset, mapping datasets, and retrieving the backend dataset and arguments.
6
+ """
7
+ from lmflow.datasets.dataset import Dataset
lmflow/datasets/dataset.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """This Python code defines a class Dataset with methods for initializing, loading,
4
+ and manipulating datasets from different backends such as Hugging Face and JSON.
5
+
6
+ The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging
7
+ Face dataset, mapping datasets, and retrieving the backend dataset and arguments.
8
+ """
9
+
10
+
11
+
12
+ # Importing necessary libraries and modules
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ from datasets import load_dataset
18
+ from datasets import Dataset as HFDataset
19
+
20
+ from lmflow.args import DatasetArguments
21
+
22
DATASET_TYPES = [
    "text_only",
    "text2text",
]

# Top-level keys of the on-disk / dict dataset format.
KEY_TYPE = "type"
KEY_INSTANCES = "instances"


class Dataset:
    r"""
    Initializes the Dataset object with the given parameters.

    When a dataset path is available and the backend is "huggingface", every
    `*.json` file directly under that path is checked to carry the same
    "type" value and the files are then loaded as one dataset.

    Parameters
    ------------
    data_args : DatasetArguments object.
        Contains the arguments required to load the dataset. May be None, in
        which case an empty dataset object is created.

    backend : str,  default="huggingface"
        A string representing the dataset backend. Defaults to "huggingface".

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.

    Raises
    ------------
    ValueError
        If a data file has no "type" field or the files disagree on it.
    NotImplementedError
        If `backend` is not a supported backend.
    """
    def __init__(self, data_args=None, backend: str="huggingface", *args, **kwargs):
        self.data_args = data_args
        self.backend = backend
        self.backend_dataset = None
        self.type = None    # Original type of the dataset
        # `data_args` defaults to None, so it must not be dereferenced blindly.
        self.dataset_path = None if data_args is None else data_args.dataset_path

        if self.dataset_path is None:
            return

        if backend == "huggingface":
            data_files = [
                x.absolute().as_posix()
                for x in Path(self.dataset_path).glob("*.json")
            ]

            # All files must agree on the data type before they are merged
            self.type = self._read_common_type(data_files)

            # Load the dataset using the HuggingFace dataset library
            extensions = "json"
            raw_dataset = load_dataset(
                extensions,
                data_files=data_files,
                field=KEY_INSTANCES,
                split="train",
                use_auth_token=None,
            )
            self.backend_dataset = raw_dataset
        elif backend == "json":
            # TODO (@Jiachun)
            pass
        else:
            raise NotImplementedError(f'Unsupported dataset backend "{backend}"')


    @staticmethod
    def _read_common_type(data_files):
        r"""
        Return the "type" value shared by every JSON file in `data_files`.

        Returns None when `data_files` is empty. Raises ValueError when a
        file lacks the "type" field or its type differs from earlier files.
        """
        common_type = None
        for single_file in data_files:
            # Explicit UTF-8 so the result does not depend on the platform's
            # default encoding.
            with open(single_file, encoding="utf-8") as fin:
                json_data = json.load(fin)

            if KEY_TYPE not in json_data:
                raise ValueError(
                    f'"{KEY_TYPE}" field must be specified for data, e.g.\n'
                    '{\n'
                    f'   "{KEY_TYPE}": "text_only",\n'
                    f'   "{KEY_INSTANCES}": [\n'
                    '       { "text": "Sentence 1: This is a sentence." },\n'
                    '       { "text": "Sentence 2: This is another sentence." }\n'
                    '   ]\n'
                    '}'
                )

            file_type = json_data[KEY_TYPE]
            if common_type is None:
                common_type = file_type
            elif common_type != file_type:
                # Report the offending file's own type, not the expected one
                raise ValueError(
                    'All task files must have same data types. Previous'
                    f' files have type "{common_type}", but in file'
                    f' {single_file}, it has type "{file_type}".'
                )
        return common_type


    def _check_data_type(self):
        # TODO: check if data type and data structure matches, raise messages
        # with hints
        pass


    def from_dict(self, dict_obj: dict, *args, **kwargs):
        r"""
        Create a Dataset object from a dictionary.

        Return a Dataset given a dict with format:
            {
                "type": TYPE,
                "instances": [
                    {
                        "key_1": VALUE_1.1,
                        "key_2": VALUE_1.2,
                        ...
                    },
                    {
                        "key_1": VALUE_2.1,
                        "key_2": VALUE_2.2,
                        ...
                    },
                    ...
                ]
            }

        The column names are taken from the first instance.

        Parameters
        -----------

        dict_obj : dict.
            A dictionary containing the dataset information.

        args : Optional.
            Positional arguments, forwarded to the backend constructor.

        kwargs : Optional.
            Keyword arguments, forwarded to the backend constructor.

        Returns
        ---------

        self : Dataset object.

        Raises
        ---------
        ValueError
            If "type" or "instances" is missing from `dict_obj`.
        NotImplementedError
            If the backend is not "huggingface".
        """
        if self.backend == "huggingface":
            if KEY_TYPE not in dict_obj:
                raise ValueError(
                    f'"{KEY_TYPE}" must be provided to initialize a dataset'
                )
            if KEY_INSTANCES not in dict_obj:
                raise ValueError(
                    f'"{KEY_INSTANCES}" must be provided to initialize a dataset'
                )

            self.type = dict_obj[KEY_TYPE]

            # Convert the list of row dicts into a dict of columns
            instances = dict_obj[KEY_INSTANCES]
            hf_dict = {}
            if len(instances) > 0:
                for key in instances[0].keys():
                    hf_dict[key] = [instance[key] for instance in instances]

            self.backend_dataset = HFDataset.from_dict(hf_dict, *args, **kwargs)
            return self
        else:
            raise NotImplementedError(
                f'Currently .from_dict is not supported for backend "{self.backend}"'
            )


    @classmethod
    def create_from_dict(cls, dict_obj, *args, **kwargs):
        r"""
        Build a new dataset object (of the calling class) from a dict; see
        `from_dict` for the expected format. Extra arguments are forwarded
        to `from_dict`.

        Returns
        --------

        Returns a Dataset object given a dict.
        """
        empty_data_args = DatasetArguments(dataset_path=None)
        dataset = cls(empty_data_args)
        return dataset.from_dict(dict_obj, *args, **kwargs)


    def to_dict(self):
        r"""
        Returns
        ---------

        Return a dict represents the dataset:
            {
                "type": TYPE,
                "instances": [
                    {
                        "key_1": VALUE_1.1,
                        "key_2": VALUE_1.2,
                        ...
                    },
                    {
                        "key_1": VALUE_2.1,
                        "key_2": VALUE_2.2,
                        ...
                    },
                    ...
                ]
            }

        A python dict object represents the content of this dataset.

        Raises
        ---------
        NotImplementedError
            If the backend is not "huggingface".
        """
        if self.backend == "huggingface":
            dict_obj = {}
            dict_obj[KEY_TYPE] = self.get_type()

            hf_dict = self.backend_dataset.to_dict()
            dict_obj[KEY_INSTANCES] = []

            # The number of rows is read from the first column
            first_key = next(iter(hf_dict), None)
            if first_key is not None:
                num_instances = len(hf_dict[first_key])
                dict_obj[KEY_INSTANCES] = [
                    {key: column[i] for key, column in hf_dict.items()}
                    for i in range(num_instances)
                ]

            return dict_obj
        else:
            raise NotImplementedError(
                f'Current .to_dict is not supported for backend "{self.backend}"'
            )


    def map(self, *args, **kwargs):
        r"""
        Apply the backend dataset's `map` and keep the result as the new
        backend dataset.

        Parameters
        ------------
        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ---------

        self : Dataset object.

        Raises
        ---------
        NotImplementedError
            If the backend is not "huggingface".
        """
        # If the dataset uses Hugging Face as the backend,
        # call the `map()` function of the Hugging Face backend dataset
        if self.backend == "huggingface":
            # Set the mapped dataset as the backend dataset of the current dataset
            mapped_backend_dataset = self.backend_dataset.map(*args, **kwargs)
            self.backend_dataset = mapped_backend_dataset
            return self
        else:
            # If the backend is not Hugging Face, raise a NotImplementedError
            raise NotImplementedError(
                f'Currently .map is not supported for backend "{self.backend}"'
            )


    def get_backend(self) -> Optional[str]:
        r"""
        Returns
        ---------

        self.backend
        """
        return self.backend


    def get_backend_dataset(self):
        r"""
        Returns
        ---------

        self.backend_dataset
        """
        return self.backend_dataset


    def get_data_args(self):
        r"""
        Returns
        ---------

        self.data_args
        """
        return self.data_args


    def get_type(self):
        r"""
        Returns
        ---------

        self.type
        """
        return self.type
lmflow/models/__init__.py ADDED
File without changes
lmflow/models/auto_model.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """Automatically get correct model type.
4
+ """
5
+
6
+ from lmflow.models.hf_decoder_model import HFDecoderModel
7
+ from lmflow.models.text_regression_model import TextRegressionModel
8
+ from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel
9
+
10
class AutoModel:
    """Factory that builds the model wrapper matching `model_args.arch_type`."""

    @classmethod
    def get_model(cls, model_args, *args, **kwargs):
        """
        Instantiate the model class selected by `model_args.arch_type`.

        Parameters
        ------------
        model_args :
            Model arguments; only `arch_type` is read here, the object is
            then handed to the selected model class.

        args : Optional.
            Positional arguments forwarded to the model constructor.

        kwargs : Optional.
            Keyword arguments forwarded to the model constructor.

        Returns
        ------------
        model :
            An `HFDecoderModel` for "decoder_only", a `TextRegressionModel`
            for "text_regression" or an `HFEncoderDecoderModel` for
            "encoder_decoder".

        Raises
        ------------
        NotImplementedError
            If `arch_type` is none of the three supported values.
        """
        arch_type = model_args.arch_type

        if arch_type == "decoder_only":
            return HFDecoderModel(model_args, *args, **kwargs)
        if arch_type == "text_regression":
            return TextRegressionModel(model_args, *args, **kwargs)
        if arch_type == "encoder_decoder":
            return HFEncoderDecoderModel(model_args, *args, **kwargs)

        raise NotImplementedError(
            f"model architecture type \"{arch_type}\" is not supported"
        )
lmflow/models/base_model.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """Base model class.
4
+ """
5
+
6
+ from abc import ABC
7
+
8
+
9
class BaseModel(ABC):
    """Common root of the model classes; holds no state of its own."""

    def __init__(self, *args, **kwargs):
        """Accept any arguments and ignore them."""
        return None
lmflow/models/decoder_model.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """A one-line summary of the module or program, terminated by a period.
4
+
5
+ Leave one blank line. The rest of this docstring should contain an
6
+ overall description of the module or program. Optionally, it may also
7
+ contain a brief description of exported classes and functions and/or usage
8
+ examples.
9
+
10
+ Typical usage example:
11
+
12
+ foo = ClassFoo()
13
+ bar = foo.FunctionBar()
14
+ """
15
+
16
+ from lmflow.models.base_model import BaseModel
17
+
18
+
19
class DecoderModel(BaseModel):
    """Marker base class for decoder models; adds nothing to `BaseModel`."""

    def __init__(self, *args, **kwargs):
        """Accept any arguments and ignore them."""
        return None
lmflow/models/encoder_decoder_model.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """A one-line summary of the module or program, terminated by a period.
4
+
5
+ Leave one blank line. The rest of this docstring should contain an
6
+ overall description of the module or program. Optionally, it may also
7
+ contain a brief description of exported classes and functions and/or usage
8
+ examples.
9
+
10
+ Typical usage example:
11
+
12
+ foo = ClassFoo()
13
+ bar = foo.FunctionBar()
14
+ """
15
+
16
+ from lmflow.models.base_model import BaseModel
17
+
18
+
19
class EncoderDecoderModel(BaseModel):
    """Marker base class for encoder-decoder models; adds nothing to `BaseModel`."""

    def __init__(self, *args, **kwargs):
        """Accept any arguments and ignore them."""
        return None
lmflow/models/hf_decoder_model.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """This is a class called HFDecoderModel which is a wrapper around transformers model and
4
+ tokenizer classes. It has several methods such as __init__, tokenize, and train that are
5
+ used for training and fine-tuning the model. The __init__ method takes in several arguments
6
+ such as model_args, tune_strategy, and ds_config, which are used to load the pretrained
7
+ model and tokenizer, and initialize the training settings.
8
+
9
+ The tokenize method is used to tokenize the input text and return the input IDs and attention
10
+ masks that can be fed to the model for training or inference.
11
+
12
+ This class supports different tune_strategy options such as 'normal', 'none', 'lora', and
13
+ 'adapter', which allow for different fine-tuning settings of the model. However, the 'lora'
14
+ and 'adapter' strategies are not yet implemented.
15
+
16
+ Overall, this class provides a convenient interface for loading and fine-tuning transformer
17
+ models and can be used for various NLP tasks such as language modeling, text classification,
18
+ and question answering.
19
+ """
20
+
21
+ import logging
22
+ from typing import List, Union
23
+
24
+ import deepspeed
25
+
26
+ from peft import (
27
+ LoraConfig,
28
+ PeftModel,
29
+ TaskType,
30
+ get_peft_config,
31
+ get_peft_model,
32
+ )
33
+
34
+ import torch
35
+ import transformers
36
+ from transformers.deepspeed import HfDeepSpeedConfig
37
+
38
+ from transformers.testing_utils import CaptureLogger
39
+
40
+ from transformers import (
41
+ CONFIG_MAPPING,
42
+ AutoConfig,
43
+ AutoTokenizer,
44
+ AutoModelForCausalLM,
45
+ )
46
+
47
+ from lmflow.datasets.dataset import Dataset
48
+ from lmflow.models.decoder_model import DecoderModel
49
+ from lmflow.models.interfaces.tunable import Tunable
50
+ from lmflow.utils.constants import (
51
+ TEXT_ONLY_DATASET_DESCRIPTION,
52
+ TEXT2TEXT_DATASET_DESCRIPTION,
53
+ )
54
+
55
+
56
+ logger = logging.getLogger(__name__)
57
+
58
+
59
class HFDecoderModel(DecoderModel, Tunable):
    r"""
    Wrapper around a Hugging Face causal language model and its tokenizer.

    Parameters
    ------------

    model_args :
        Model arguments such as model name, path, revision, etc.

    tune_strategy : str, default="normal".
        How the model is loaded. "normal" loads config, tokenizer and model
        for tuning (wrapped with LoRA when `model_args.use_lora` is set),
        "none" loads model and tokenizer for inference, "adapter" raises
        NotImplementedError.

    ds_config :
        Deepspeed configurations, used by the "none" strategy on "gpu".

    device : str, default="gpu".
        "gpu" runs generation through a deepspeed engine, "cpu" through the
        backend model directly.

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.
    """

    def __init__(
        self,
        model_args,
        tune_strategy='normal',
        ds_config=None,
        device="gpu",
        *args,
        **kwargs
    ):
        """
        Initializes a HFDecoderModel instance.
        :param model_args: object with model arguments such as model name, path, revision, etc.
        :param tune_strategy: tuning strategy: normal, none or adapter
        :param ds_config: deepspeed configuration for distributed training
        :param device: "gpu" or "cpu"
        """
        self.device = device
        self.model_args = model_args

        # "auto" and None are forwarded untouched; any other value is looked
        # up as an attribute of the torch module.
        if model_args.torch_dtype in ["auto", None]:
            torch_dtype = model_args.torch_dtype
        else:
            torch_dtype = getattr(torch, model_args.torch_dtype)

        if tune_strategy == 'normal':
            self._load_for_tuning(model_args, tune_strategy, torch_dtype)
        elif tune_strategy == 'none':
            self._load_for_inference(model_args, ds_config, device, torch_dtype)
        elif tune_strategy == 'adapter':
            raise NotImplementedError('adapter tune strategy not implemented')
        # NOTE: any other tune_strategy value falls through silently and
        # leaves the instance without a model or tokenizer.


    def _load_for_tuning(self, model_args, tune_strategy, torch_dtype):
        """Load config, tokenizer and model for the 'normal' strategy."""
        auth_token = True if model_args.use_auth_token else None

        # Config: explicit config name first, then the model path, otherwise
        # a fresh config built from the model type.
        config_kwargs = {
            "cache_dir": model_args.cache_dir,
            "revision": model_args.model_revision,
            "use_auth_token": auth_token,
        }
        if model_args.config_name:
            config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
        elif model_args.model_name_or_path:
            config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
        else:
            config = CONFIG_MAPPING[model_args.model_type]()
            logger.warning("You are instantiating a new config instance from scratch.")
            if model_args.config_overrides is not None:
                logger.info(f"Overriding config: {model_args.config_overrides}")
                config.update_from_string(model_args.config_overrides)
                logger.info(f"New config: {config}")

        # Tokenizer: must come from a name or path, it is never built here.
        tokenizer_kwargs = {
            "cache_dir": model_args.cache_dir,
            "use_fast": model_args.use_fast_tokenizer,
            "revision": model_args.model_revision,
            "use_auth_token": auth_token,
        }
        if model_args.tokenizer_name:
            tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
        elif model_args.model_name_or_path:
            tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
        else:
            raise ValueError(
                "You are instantiating a new tokenizer from scratch. This is"
                " not supported by this script. You can do it from another"
                " script, save it, and load it from here, using"
                " --tokenizer_name."
            )

        # Model: pretrained weights when a path is given, otherwise random
        # initialization from the config.
        if model_args.model_name_or_path:
            model = AutoModelForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
                revision=model_args.model_revision,
                use_auth_token=auth_token,
                torch_dtype=torch_dtype,
            )
        else:
            model = AutoModelForCausalLM.from_config(config)
            # Keyed by data pointer so tied parameters are counted once.
            unique_sizes = {p.data_ptr(): p.numel() for p in model.parameters()}
            n_params = sum(unique_sizes.values())
            logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

        # Keep a handle on the unwrapped model before LoRA may wrap it.
        self.backend_model_full = model
        if model_args.use_lora:
            peft_config = LoraConfig(
                task_type=TaskType.CAUSAL_LM,
                inference_mode=False,
                r=model_args.lora_r,
                lora_alpha=model_args.lora_alpha,
                lora_dropout=model_args.lora_dropout,
                # Falsy values (None, empty list) are normalized to None.
                target_modules=model_args.lora_target_modules or None,
            )
            model = get_peft_model(model, peft_config)
            model.print_trainable_parameters()

        # We resize the embeddings only when necessary to avoid index errors.
        # If you are creating a model from scratch on a small vocab and want a
        # smaller embedding size, remove this test.
        embedding_size = model.get_input_embeddings().weight.shape[0]
        if len(tokenizer) > embedding_size:
            model.resize_token_embeddings(len(tokenizer))

        self.config = config
        self.backend_model = model
        self.tokenizer = tokenizer
        self.tune_strategy = tune_strategy


    def _load_for_inference(self, model_args, ds_config, device, torch_dtype):
        """Load model and tokenizer for the 'none' strategy."""
        peft_model_id = model_args.lora_model_path

        # NOTE: Currently offload is not supported by llama
        if "llama" in model_args.model_name_or_path and model_args.use_ram_optimized_load:
            logger.warning(
                "llama does not support RAM optimized load. Automatically"
                " use original load instead."
            )
            model_args.use_ram_optimized_load = False

        model = None
        if model_args.use_ram_optimized_load and peft_model_id is None:
            try:
                # RAM-optimized load
                model = AutoModelForCausalLM.from_pretrained(
                    model_args.model_name_or_path,
                    device_map="auto",
                    offload_folder="offload",
                    offload_state_dict=True,
                    torch_dtype=torch_dtype,
                )
            except Exception:
                # Any loading error falls back to the normal load below;
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                logger.warning(
                    "Failed to use RAM optimized load. Automatically"
                    " use original load instead."
                )
        elif peft_model_id is not None:
            logger.warning(
                "LoRA does not support RAM optimized load currently."
                " Automatically use original load instead."
            )

        if model is None:
            # Normal load
            model = AutoModelForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                torch_dtype=torch_dtype,
            )
        self.backend_model = model

        self.tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
        # Keep a handle on the unwrapped model before LoRA weights may wrap it.
        self.backend_model_full = self.backend_model
        if peft_model_id is not None:
            self.backend_model = PeftModel.from_pretrained(
                self.backend_model, peft_model_id
            )

        if device == "gpu":
            deepspeed.init_distributed()
            self.ds_engine = deepspeed.initialize(model=self.backend_model, config_params=ds_config)[0]
            self.ds_engine.module.eval()


    def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs):
        """
        Tokenize the full dataset.

        Parameters
        ------------
        dataset : lmflow.datasets.Dataset.
            Dataset with a "huggingface" backend and type "text_only" or
            "text2text".

        add_special_tokens : bool, default=True.
            Forwarded to the tokenizer for every tokenized column.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ------------
        tokenized_datasets :
            The result of `dataset.map(...)`, whose columns are replaced by
            "input_ids", "attention_mask" and "labels". Columns that do not
            contribute to the loss get the label -100.

        Raises
        ------------
        NotImplementedError
            If the backend is not "huggingface" or the dataset type is not
            supported.
        """
        if dataset.get_backend() != "huggingface":
            raise NotImplementedError(
                "tokenization of datasets with non-huggingface backend are"
                " not supported yet"
            )

        dataset_type = dataset.get_type()

        # Per dataset type:
        #   1) which fields are tokenized and in which order their token
        #      sequences are concatenated, e.g.
        #          "text_only": "text" -> "text"
        #          "text2text": "input", "output" -> "input" + "output"
        #   2) which of those fields contribute to the loss, e.g.
        #          "text_only": "text"
        #          "text2text": "output" only
        if dataset_type == "text_only":
            tokenized_column_order, label_columns = ["text"], ["text"]
        elif dataset_type == "text2text":
            tokenized_column_order, label_columns = ["input", "output"], ["output"]
        else:
            raise NotImplementedError(
                f"dataset type \"{dataset_type}\" is not supported, currently"
                " only support following data types:\n"
                f" 1) {TEXT_ONLY_DATASET_DESCRIPTION}\n"
                f" 2) {TEXT2TEXT_DATASET_DESCRIPTION}\n"
            )

        model_args = self.model_args
        column_names = list(dataset.get_backend_dataset().features)

        # since this will be pickled to avoid _LazyModule error in Hasher force
        # logger loading before tokenize_function
        tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

        def tokenize_function(examples):
            num_example = len(examples[column_names[0]])
            token_dict = {
                "input_ids": [[] for _ in range(num_example)],
                "attention_mask": [[] for _ in range(num_example)],
                "labels": [[] for _ in range(num_example)],
            }
            with CaptureLogger(tok_logger) as cl:
                for column_name in tokenized_column_order:
                    encoding = self.tokenizer(
                        examples[column_name],
                        add_special_tokens=add_special_tokens,
                        truncation=True if model_args.use_lora else None,
                    )

                    if column_name in label_columns:
                        labels = encoding["input_ids"].copy()
                    else:
                        # -100 for every token of a column without loss.
                        labels = [
                            [-100] * len(encoding["input_ids"][i])
                            for i in range(num_example)
                        ]

                    # Append this column's tokens to each example's sequence.
                    for i in range(num_example):
                        token_dict["input_ids"][i].extend(encoding["input_ids"][i])
                        token_dict["attention_mask"][i].extend(encoding["attention_mask"][i])
                        token_dict["labels"][i].extend(labels[i])

            # clm input could be much much longer than block_size
            if "Token indices sequence length is longer than the" in cl.out:
                tok_logger.warning(
                    "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                    " before being passed to the model."
                )
            return token_dict

        data_args = dataset.get_data_args()
        map_kwargs = {"batched": True}
        if not data_args.streaming:
            # Worker count, caching and progress description are only passed
            # for non-streaming datasets.
            map_kwargs["num_proc"] = data_args.preprocessing_num_workers
            map_kwargs["remove_columns"] = column_names
            map_kwargs["load_from_cache_file"] = not data_args.overwrite_cache
            map_kwargs["desc"] = "Running tokenizer on dataset"
        else:
            map_kwargs["remove_columns"] = column_names
        return dataset.map(tokenize_function, **map_kwargs)


    def encode(self, input: Union[str, List[str]], *args, **kwargs ) -> Union[List[int], List[List[int]]]:
        """
        Perform encoding process of the tokenizer.

        Parameters
        ------------
        input : str or list.
            The text sequence, or a list of text sequences which are encoded
            one by one.

        args : Optional.
            Positional arguments, forwarded to `tokenizer.encode`.

        kwargs : Optional.
            Keyword arguments, forwarded to `tokenizer.encode`.

        Returns
        ------------
        outputs :
            The tokenized inputs.

        Raises
        ------------
        NotImplementedError
            If `input` is neither a str nor a list.
        """
        if isinstance(input, list):
            return [self.encode(item, *args, **kwargs) for item in input]
        if isinstance(input, str):
            # Positional on purpose: `text=input` together with forwarded
            # positional args raised "multiple values for argument 'text'".
            return self.tokenizer.encode(input, *args, **kwargs)
        raise NotImplementedError(f'type "{type(input)}" cannot be encoded')


    def decode(self, input, *args, **kwargs ) -> Union[str, List[str]]:
        """
        Perform decoding process of the tokenizer.

        Parameters
        ------------
        input : list.
            The token sequence, or a non-empty list of token sequences which
            are decoded one by one.

        args : Optional.
            Positional arguments, forwarded to `tokenizer.decode`.

        kwargs : Optional.
            Keyword arguments, forwarded to `tokenizer.decode`.

        Returns
        ------------
        outputs :
            The text decoded from the token inputs.
        """
        is_batch = isinstance(input, list) and input and isinstance(input[0], list)
        if is_batch:
            return [self.decode(item, *args, **kwargs) for item in input]
        # Can be list of ints or a Tensor
        return self.tokenizer.decode(input, *args, **kwargs)


    def inference(self, inputs, *args, **kwargs):
        """
        Perform generation process of the model.

        Parameters
        ------------
        inputs :
            The sequence used as a prompt for the generation or as model inputs to the model.

        args : Optional.
            Positional arguments, forwarded to `generate`.

        kwargs : Optional.
            Keyword arguments, forwarded to `generate`.

        Returns
        ------------
        outputs :
            The generated sequence output

        Raises
        ------------
        NotImplementedError
            If `self.device` is neither "gpu" nor "cpu".
        """
        with torch.no_grad():
            if self.device == "gpu":
                generator = self.ds_engine.module
            elif self.device == "cpu":
                generator = self.backend_model
            else:
                raise NotImplementedError(
                    f"device \"{self.device}\" is not supported"
                )
            # The EOS token doubles as the padding token during generation.
            return generator.generate(
                input_ids=inputs,
                synced_gpus=True,
                pad_token_id=self.tokenizer.eos_token_id,
                *args,
                **kwargs
            )


    def merge_lora_weights(self):
        """
        Call `merge_and_unload()` on the backend model when LoRA is enabled,
        otherwise only log a warning.
        """
        if not self.model_args.use_lora:
            logger.warning("LoRA training is NOT enabled. Merging LoRA weights is not applicable.")
            return
        self.get_backend_model().merge_and_unload()


    def save(self, dir, save_full_model=False, *args, **kwargs):
        """
        Save the tokenizer and the model to a directory.

        Parameters
        ------------
        dir :
            The directory to save model and tokenizer

        save_full_model : Optional.
            Whether to save full model. Only takes effect when LoRA is
            enabled; the unwrapped model is then saved instead of the
            backend model.

        kwargs : Optional.
            Keyword arguments.
        """
        self.get_tokenizer().save_pretrained(dir)
        if save_full_model and self.model_args.use_lora:
            target = self.backend_model_full
        else:
            target = self.get_backend_model()
        target.save_pretrained(dir)


    def get_max_length(self):
        """
        Return max acceptable input length in terms of tokens.
        """
        return self.tokenizer.model_max_length


    def get_tokenizer(self):
        """
        Return the tokenizer of the model.
        """
        return self.tokenizer


    def get_backend_model(self):
        """
        Return the backend model.
        """
        return self.backend_model
lmflow/models/hf_encoder_decoder_model.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """This is a class called HFEncoderDecoderModel which is a wrapper around transformers model and
4
+ tokenizer classes. It has several methods such as __init__, tokenize, and train that are
5
+ used for training and fine-tuning the model. The __init__ method takes in several arguments
6
+ such as model_args, tune_strategy, and ds_config, which are used to load the pretrained
7
+ model and tokenizer, and initialize the training settings.
8
+
9
+ The tokenize method is used to tokenize the input text and return the input IDs and attention
10
+ masks that can be fed to the model for training or inference.
11
+
12
+ This class supports different tune_strategy options such as 'normal', 'none', 'lora', and
13
+ 'adapter', which allow for different fine-tuning settings of the model. However, the 'lora'
14
+ and 'adapter' strategies are not yet implemented.
15
+
16
+ Overall, this class provides a convenient interface for loading and fine-tuning transformer
17
+ models and can be used for various NLP tasks such as language modeling, text classification,
18
+ and question answering.
19
+ """
20
+
21
+ import logging
22
+ from typing import List, Union
23
+
24
+ import deepspeed
25
+
26
+ from peft import (
27
+ LoraConfig,
28
+ PeftModel,
29
+ TaskType,
30
+ get_peft_config,
31
+ get_peft_model,
32
+ )
33
+
34
+ import torch
35
+ import transformers
36
+ from transformers.deepspeed import HfDeepSpeedConfig
37
+
38
+ from transformers.testing_utils import CaptureLogger
39
+
40
+ from transformers import (
41
+ CONFIG_MAPPING,
42
+ AutoConfig,
43
+ AutoTokenizer,
44
+ AutoModelForSeq2SeqLM,
45
+ AutoModel,
46
+ )
47
+
48
+ from lmflow.datasets.dataset import Dataset
49
+ from lmflow.models.encoder_decoder_model import EncoderDecoderModel
50
+ from lmflow.models.interfaces.tunable import Tunable
51
+
52
+
53
+ logger = logging.getLogger(__name__)
54
+
55
+
56
class HFEncoderDecoderModel(EncoderDecoderModel, Tunable):
    r"""
    Wrapper around a Hugging Face encoder-decoder model and its tokenizer.

    Parameters
    ------------

    model_args :
        Model arguments such as model name, path, revision, etc.

    tune_strategy : str, default="normal".
        How the model is loaded. Only "none" (load for inference) is
        supported; "normal" and "adapter" raise NotImplementedError.

    ds_config :
        Deepspeed configurations.

    device : str, default="gpu".
        "gpu" runs generation through a deepspeed engine, "cpu" through the
        backend model directly.

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.
    """

    def __init__(
        self,
        model_args,
        tune_strategy='normal',
        ds_config=None,
        device="gpu",
        *args,
        **kwargs
    ):
        """
        Initializes a HFEncoderDecoderModel instance.
        :param model_args: object with model arguments such as model name, path, revision, etc.
        :param tune_strategy: tuning strategy: normal, none or adapter
        :param ds_config: deepspeed configuration for distributed training
        :param device: "gpu" or "cpu"
        """
        self.device = device
        # merge_lora_weights() and save() read self.model_args, so it has to
        # be stored; it was previously never assigned.
        self.model_args = model_args

        if tune_strategy == 'normal':
            raise NotImplementedError(
                f"tune_strategy \"{tune_strategy}\" is not supported"
            )
        elif tune_strategy == 'none':
            # Bound to a local so the object exists while the model is loaded.
            dschf = HfDeepSpeedConfig(ds_config)
            peft_model_id = model_args.lora_model_path

            # NOTE: Currently offload is not supported by llama
            if "llama" in model_args.model_name_or_path and model_args.use_ram_optimized_load:
                logger.warning(
                    "llama does not support RAM optimized load. Automatically"
                    " use original load instead."
                )
                model_args.use_ram_optimized_load = False

            model = None
            if model_args.model_name_or_path == 'THUDM/chatglm-6b':
                # This checkpoint is loaded through AutoModel with remote code.
                model = AutoModel.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
            elif model_args.use_ram_optimized_load and peft_model_id is None:
                try:
                    # RAM-optimized load
                    model = AutoModelForSeq2SeqLM.from_pretrained(
                        model_args.model_name_or_path,
                        device_map="auto",
                        offload_folder="offload",
                        offload_state_dict=True,
                    )
                except Exception:
                    # Any loading error falls back to the normal load below;
                    # KeyboardInterrupt/SystemExit are deliberately not caught.
                    logger.warning(
                        "Failed to use RAM optimized load. Automatically"
                        " use original load instead."
                    )
            elif peft_model_id is not None:
                logger.warning(
                    "LoRA does not support RAM optimized load currently."
                    " Automatically use original load instead."
                )

            if model is None:
                # Normal load
                model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_args.model_name_or_path,
                )
            self.backend_model = model

            self.tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
            # Keep a handle on the unwrapped model before LoRA weights may wrap it.
            self.backend_model_full = self.backend_model
            if peft_model_id is not None:
                self.backend_model = PeftModel.from_pretrained(
                    self.backend_model, peft_model_id
                )

            if device == "gpu":
                deepspeed.init_distributed()
                self.ds_engine = deepspeed.initialize(model=self.backend_model, config_params=ds_config)[0]
                self.ds_engine.module.eval()

        elif tune_strategy == 'adapter':
            raise NotImplementedError('adapter tune strategy not implemented')
        # NOTE: any other tune_strategy value falls through silently and
        # leaves the instance without a model or tokenizer.


    def tokenize(self, dataset, *args, **kwargs):
        """
        Tokenize the full dataset (not implemented for this model).

        Parameters
        ------------
        dataset :
            Text dataset.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Raises
        ------------
        NotImplementedError
            Always.
        """
        raise NotImplementedError('tokenize not implemented')

    def encode(self, input: Union[str, List[str]], *args, **kwargs ) -> Union[List[int], List[List[int]]]:
        """
        Perform encoding process of the tokenizer.

        Parameters
        ------------
        input : str or list.
            The text sequence, or a list of text sequences which are encoded
            one by one.

        args : Optional.
            Positional arguments, forwarded to `tokenizer.encode`.

        kwargs : Optional.
            Keyword arguments, forwarded to `tokenizer.encode`.

        Returns
        ------------
        outputs :
            The tokenized inputs.

        Raises
        ------------
        NotImplementedError
            If `input` is neither a str nor a list.
        """
        if isinstance(input, list):
            return [self.encode(item, *args, **kwargs) for item in input]
        if isinstance(input, str):
            # Positional on purpose: `text=input` together with forwarded
            # positional args raised "multiple values for argument 'text'".
            return self.tokenizer.encode(input, *args, **kwargs)
        raise NotImplementedError(f'type "{type(input)}" cannot be encoded')


    def decode(self, input, *args, **kwargs ) -> Union[str, List[str]]:
        """
        Perform decoding process of the tokenizer.

        Parameters
        ------------
        input : list.
            The token sequence, or a non-empty list of token sequences which
            are decoded one by one.

        args : Optional.
            Positional arguments, forwarded to `tokenizer.decode`.

        kwargs : Optional.
            Keyword arguments, forwarded to `tokenizer.decode`.

        Returns
        ------------
        outputs :
            The text decoded from the token inputs.
        """
        is_batch = isinstance(input, list) and input and isinstance(input[0], list)
        if is_batch:
            return [self.decode(item, *args, **kwargs) for item in input]
        # Can be list of ints or a Tensor
        return self.tokenizer.decode(input, *args, **kwargs)


    def inference(self, inputs, *args, **kwargs):
        """
        Perform generation process of the model.

        Parameters
        ------------
        inputs :
            The sequence used as a prompt for the generation or as model inputs to the model.

        args : Optional.
            Positional arguments, forwarded to `generate`.

        kwargs : Optional.
            Keyword arguments, forwarded to `generate`.

        Returns
        ------------
        outputs :
            The generated sequence output

        Raises
        ------------
        NotImplementedError
            If `self.device` is neither "gpu" nor "cpu".
        """
        with torch.no_grad():
            if self.device == "gpu":
                generator = self.ds_engine.module
            elif self.device == "cpu":
                generator = self.backend_model
            else:
                raise NotImplementedError(
                    f"device \"{self.device}\" is not supported"
                )
            # The EOS token doubles as the padding token during generation.
            return generator.generate(
                input_ids=inputs,
                synced_gpus=True,
                pad_token_id=self.tokenizer.eos_token_id,
                *args,
                **kwargs
            )


    def merge_lora_weights(self):
        """
        Call `merge_and_unload()` on the backend model when LoRA is enabled,
        otherwise only log a warning.
        """
        if not self.model_args.use_lora:
            logger.warning("LoRA training is NOT enabled. Merging LoRA weights is not applicable.")
            return
        self.get_backend_model().merge_and_unload()


    def save(self, dir, save_full_model=False, *args, **kwargs):
        """
        Save the tokenizer and the model to a directory.

        Parameters
        ------------
        dir :
            The directory to save model and tokenizer

        save_full_model : Optional.
            Whether to save full model. Only takes effect when LoRA is
            enabled; the unwrapped model is then saved instead of the
            backend model.

        kwargs : Optional.
            Keyword arguments.
        """
        self.get_tokenizer().save_pretrained(dir)
        if save_full_model and self.model_args.use_lora:
            target = self.backend_model_full
        else:
            target = self.get_backend_model()
        target.save_pretrained(dir)


    def get_max_length(self):
        """
        Return max acceptable input length in terms of tokens.
        """
        return self.tokenizer.model_max_length


    def get_tokenizer(self):
        """
        Return the tokenizer of the model.
        """
        return self.tokenizer


    def get_backend_model(self):
        """
        Return the backend model.
        """
        return self.backend_model
lmflow/models/interfaces/__init__.py ADDED
File without changes
lmflow/models/interfaces/tunable.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """Tunable class
4
+ """
5
+
6
+ from abc import ABC
7
+
8
+
9
class Tunable(ABC):
    """Marker base class; it declares no members of its own."""
lmflow/models/regression_model.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """General regression model."""
4
+
5
+ from lmflow.models.base_model import BaseModel
6
+
7
+
8
class RegressionModel(BaseModel):
    """General regression model; a `BaseModel` with a no-op constructor."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no state is initialised."""
lmflow/models/text_regression_model.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ A model maps "text_only" data to float.
5
+ """
6
+
7
+ from lmflow.models.regression_model import RegressionModel
8
+ from lmflow.datasets.dataset import Dataset
9
+
10
+
11
class TextRegressionModel(RegressionModel):
    r"""
    A regression model whose inference is delegated to a user-registered
    function.

    Parameters
    ------------

    model_args :
        Model arguments such as model name, path, revision, etc.

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.
    """

    def __init__(
        self,
        model_args,
        *args,
        **kwargs
    ):
        """
        Initializes a TextRegressionModel instance with no inference
        function registered.

        :param model_args: dictionary with model arguments such as model name, path, revision, etc.
        """
        self.inference_func = None


    def register_inference_function(self, inference_func):
        """
        Registers the callable used by `inference`, replacing any
        previously registered one.
        """
        self.inference_func = inference_func


    def inference(self, inputs: Dataset):
        """
        Gets regression results of a given dataset.

        :inputs: Dataset object, only accept type "text_only".

        Returns whatever the registered inference function returns, or
        `None` when no function has been registered.
        """
        registered = self.inference_func
        if registered is None:
            return None
        return registered(inputs)
lmflow/pipeline/__init__.py ADDED
File without changes
lmflow/pipeline/auto_pipeline.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """Return a pipeline automatically based on its name.
4
+ """
5
+
6
+ from lmflow.pipeline.evaluator import Evaluator
7
+ from lmflow.pipeline.finetuner import Finetuner
8
+ from lmflow.pipeline.inferencer import Inferencer
9
+ from lmflow.pipeline.raft_aligner import RaftAligner
10
+
11
+
12
# Maps a pipeline name to the pipeline class that implements it.
PIPELINE_MAPPING = {
    "evaluator": Evaluator,
    "finetuner": Finetuner,
    "inferencer": Inferencer,
    "raft_aligner": RaftAligner,
}


class AutoPipeline:
    """
    The class designed to return a pipeline automatically based on its name.
    """
    @classmethod
    def get_pipeline(cls,
        pipeline_name,
        model_args,
        data_args,
        pipeline_args,
        *args,
        **kwargs
    ):
        """
        Instantiate the pipeline registered under `pipeline_name`.

        Parameters
        ------------
        pipeline_name :
            Key into `PIPELINE_MAPPING`.

        model_args, data_args, pipeline_args :
            Forwarded, in this order, to the pipeline constructor.

        args, kwargs : Optional.
            Extra arguments forwarded to the pipeline constructor.

        Returns
        ------------
        pipeline :
            The constructed pipeline instance.

        Raises
        ------------
        NotImplementedError
            If `pipeline_name` is not a key of `PIPELINE_MAPPING`.
        """
        if pipeline_name not in PIPELINE_MAPPING:
            raise NotImplementedError(
                f'Pipeline "{pipeline_name}" is not supported'
            )

        pipeline_class = PIPELINE_MAPPING[pipeline_name]
        return pipeline_class(
            model_args,
            data_args,
            pipeline_args,
            *args,
            **kwargs
        )
lmflow/pipeline/base_aligner.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """ BaseAligner: a subclass of BasePipeline.
4
+ """
5
+
6
+ from lmflow.pipeline.base_pipeline import BasePipeline
7
+
8
+
9
class BaseAligner(BasePipeline):
    """ Base class for alignment pipelines; subclasses implement `align`.
    """
    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no state is initialised."""

    def _check_if_alignable(self, model, dataset, reward_model):
        """Placeholder compatibility check; currently does nothing."""
        # TODO: check if the model is alignable and dataset is compatible
        # TODO: add reward_model
        return None

    def align(self, model, dataset, reward_model):
        """Always raises NotImplementedError; subclasses must override."""
        raise NotImplementedError(".align is not implemented")
lmflow/pipeline/base_pipeline.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """ BasePipeline.
4
+ """
5
+
6
+ from abc import ABC # abstract class
7
+
8
class BasePipeline(ABC):
    """Common base class of the pipelines; it declares no members."""
lmflow/pipeline/base_tuner.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """ BaseTuner: a subclass of BasePipeline.
4
+ """
5
+
6
+ from lmflow.pipeline.base_pipeline import BasePipeline
7
+
8
+
9
class BaseTuner(BasePipeline):
    """ Base class for tuning pipelines; subclasses implement `tune`.
    """
    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no state is initialised."""

    def _check_if_tunable(self, model, dataset):
        """Placeholder compatibility check; currently does nothing."""
        # TODO: check if the model is tunable and dataset is compatible
        return None

    def tune(self, model, dataset):
        """Always raises NotImplementedError; subclasses must override."""
        raise NotImplementedError(".tune is not implemented")
lmflow/pipeline/evaluator.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """The Evaluator class simplifies the process of running evaluation on a language model provided by a HFDecoderModel instance imported from the lmflow package. The class constructor takes three dictionaries as arguments: model_args containing arguments related to the language model, data_args containing arguments related to the data used for evaluation, and evaluator_args containing other arguments for the evaluation process.
2
+
3
+ The class has two methods: create_dataloader() that loads the data from the test file, creates a data loader, and returns it with the size of the data, and evaluate(model) that generates output text given input text. It uses the create_dataloader() method to load the data, iterates over the data in mini-batches, and encodes the input text with the encode() method of the HFDecoderModel class. Then, it generates output text using the evaluate() method of the HFDecoderModel class, decodes the generated output text using the decode() method of the HFDecoderModel class, and writes the output to a file in the output directory. The method also logs some information to the console and Weights and Biases if the use_wandb argument is True.
4
+ """
5
+ import os
6
+ # import deepspeed
7
+ import torch
8
+ import wandb
9
+ import deepspeed
10
+ import sys
11
+ import numpy as np
12
+ import datetime
13
+ import json
14
+ # TODO: remove later
15
+ from transformers import AutoConfig
16
+ import torch.distributed as dist
17
+
18
+ from lmflow.datasets.dataset import Dataset
19
+ from lmflow.pipeline.base_pipeline import BasePipeline
20
+ from lmflow.models.hf_decoder_model import HFDecoderModel
21
+ from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers
23
+
24
class Evaluator(BasePipeline):
    """
    Initializes the `Evaluator` class with given arguments.

    Parameters
    ------------
    model_args : ModelArguments object.
        Contains the arguments required to load the model.

    data_args : DatasetArguments object.
        Contains the arguments required to load the dataset.

    evaluator_args : EvaluatorArguments object.
        Contains the arguments required to perform evaluation.


    """
    def __init__(self, model_args, data_args, evaluator_args):
        # our method
        self.data_args = data_args
        self.evaluator_args = evaluator_args
        self.model_args = model_args
        print("--------Begin Evaluator Arguments----------")
        print(f"model_args : {self.model_args}")
        print(f"data_args : {self.data_args}")
        print(f"evaluator_args : {self.evaluator_args}")
        print("--------End Evaluator Arguments----------")
        # logger
        if self.evaluator_args.use_wandb:
            wandb.init(project="lmflow_evaluation")
        # random seed
        set_random_seed(self.evaluator_args.random_seed)
        self.local_rank = int(os.getenv("LOCAL_RANK", "0"))
        self.world_size = int(os.getenv("WORLD_SIZE", "1"))
        torch.cuda.set_device(self.local_rank)  # NOTE: cpu-only machine will have error
        deepspeed.init_distributed()

        self.config = AutoConfig.from_pretrained(model_args.model_name_or_path)
        try:
            self.model_hidden_size = self.config.hidden_size
        except AttributeError:
            # Only a missing attribute triggers the fallback; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            print("Error in setting hidden size, use the default size 1024")
            self.model_hidden_size = 1024  # gpt2 seems do not have hidden_size in config

        print(f"model_hidden_size = {self.model_hidden_size}")
        # batch size has to be divisible by world_size, but can be bigger than world_size
        train_batch_size = 1 * self.world_size
        self.evaluator_args.minibatch_size = train_batch_size
        self.block_size = evaluator_args.evaluate_block_size


    def create_dataloader(self, dataset: Dataset):
        """
        Build mini-batches from a dataset holding "input"/"output" pairs.

        Parameters
        ------------
        dataset : Dataset object.
            Every instance in `dataset.to_dict()["instances"]` must
            provide the keys "input" and "output".

        Returns
        ------------
        dataloader :
            The result of `batchlize` over the instances, each tagged with
            its position under "input_idx".

        dataset_size : int
            Number of instances in the dataset.
        """
        instances = dataset.to_dict()["instances"]
        dataset_buf = [
            {
                "input": instance["input"],
                "output": instance["output"],
                "input_idx": idx,
            }
            for idx, instance in enumerate(instances)
        ]
        dataset_size = len(dataset_buf)

        dataloader = batchlize(
            dataset_buf,
            self.evaluator_args.minibatch_size,
            self.evaluator_args.random_shuffle
        )
        print(f"Successfully create dataloader with size {len(dataloader)}.")
        return dataloader, dataset_size


    # TODO: Split for better unittest

    def _match(self, predicted_answer, groundtruth, answer_type=None):
        """
        Compare a predicted answer with the ground truth.

        The comparison ignores case for the answer types listed below and
        is an exact string comparison otherwise.
        """
        case_insensitive_types = [
            "strategyqa",
            "coin_flip",
            "pubmedqa",
            "binary_choice",
            "medmcqa",
            "usmle",
        ]
        if answer_type in case_insensitive_types:
            return predicted_answer.lower() == groundtruth.lower()
        return predicted_answer == groundtruth


    def _get_max_window_length(self, model):
        """
        Return the context window length used for likelihood evaluation.

        It is the smaller of the backend config's `n_positions` and
        `model.get_max_length()`; 1024 replaces `n_positions` when the
        config does not provide a usable value.
        """
        try:
            return min(model.get_backend_model().config.n_positions,
                       model.get_max_length())
        except (AttributeError, TypeError):
            # Missing (AttributeError) or None-valued (TypeError) n_positions.
            return min(1024, model.get_max_length())


    def evaluate(self, model, dataset: Dataset, metric = "accuracy"):
        """
        Perform Evaluation for a model

        Parameters
        ------------
        model : TunableModel object.
            TunableModel to perform inference

        dataset : Dataset object.
            Dataset to evaluate on.

        metric : str, default "accuracy".
            One of "acc"/"accuracy", "ppl"/"perplexity" or
            "nll"/"neg_log_likelihood".

        Raises
        ------------
        NotImplementedError
            If `metric` is none of the supported names.
        """
        if metric in ["acc", "accuracy"]:
            dataloader, data_size = self.create_dataloader(dataset)
            is_main_process = not dist.is_initialized() or dist.get_rank() == 0
            max_eval_samples = self.data_args.max_eval_samples

            output_writer = None
            if is_main_process:
                os.makedirs(self.evaluator_args.output_dir, exist_ok=True)
                output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w")

            # try/finally guarantees the result file is closed even when
            # inference or a collective operation fails mid-evaluation.
            try:
                acc_list = []
                total = 0
                for batch_index, batch in enumerate(dataloader):
                    # `None` means "no limit" instead of raising TypeError.
                    if (max_eval_samples is not None
                            and batch_index * self.world_size >= max_eval_samples):
                        break
                    # A short last batch has no sample for this rank: reuse
                    # sample 0 as padding so every rank still takes part in
                    # the collective calls below.
                    is_padding = self.local_rank >= len(batch)
                    if is_padding:
                        current_batch = batch[0]
                    else:
                        # the batch in current process
                        current_batch = batch[self.local_rank]

                    prompt_structure = self.evaluator_args.prompt_structure
                    question = prompt_structure.format(input=current_batch['input'])
                    groundtruth = current_batch['output']

                    inputs = model.encode(question, return_tensors="pt").to(device=self.local_rank)

                    outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0)
                    text_out = model.decode(outputs[0], skip_special_tokens=True)

                    # only return the generation, truncating the input
                    prompt_length = len(model.decode(inputs[0], skip_special_tokens=True,))
                    text_out = text_out[prompt_length:]
                    answer_type = self.evaluator_args.answer_type
                    pred_answer = answer_extraction(
                        text_out,
                        answer_type=answer_type,
                    )
                    print(f"batch_index{batch_index} rank{self.local_rank}:\n question={question}\n prediction={text_out}\n")
                    print(f"predicted answer: {pred_answer} \n")
                    print(f"groundtruth answer: {groundtruth} \n")

                    if is_padding:
                        # for last batch, the padding examples are ignored
                        # and do not contribute to the accuracy
                        correct_ = 0
                        total_ = 0
                    else:
                        correct_ = 0
                        total_ = 1
                        if self._match(pred_answer, groundtruth, answer_type):
                            correct_ = 1

                    # collect accuracy from all gpus
                    all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank)
                    dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False)
                    correct_, total_ = all_process.tolist()
                    avg = correct_ / total_
                    acc_list.append(avg)
                    total += total_

                    # collect predictions from all gpus
                    output_dict = {"question": question,
                                   "prediction": text_out,
                                   "pred_answer": pred_answer,
                                   "answer": groundtruth}
                    all_process_list = [{}] * self.world_size

                    dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0)
                    if is_main_process:
                        current_accuracy = np.mean(acc_list)
                        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy))

                        if self.evaluator_args.use_wandb:
                            wandb.log({"Accuracy": current_accuracy})

                        for gathered in all_process_list:
                            output_writer.write(json.dumps(gathered) + '\n')

                if is_main_process:
                    current_accuracy = np.mean(acc_list)
                    print("Final accuracy = ", current_accuracy)
            finally:
                if output_writer is not None:
                    output_writer.close()
        elif metric in ["ppl", "perplexity"]:
            ppl = self._evaluate_ppl(model, dataset)
            print(f"Evaluating final ppl: {ppl}")
        elif metric in ["nll", "neg_log_likelihood"]:
            neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset)
            print(f"Evaluating final negative log likelihood: {neg_log_likelihood}")
        else:
            raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics")


    def _evaluate_ppl(self, model, dataset: Dataset):
        """
        Evaluate perplexity over all texts of a dataset joined by blank lines.

        The joined text is scored with a window of at most `max_length`
        tokens that advances `self.block_size` tokens per step; in each
        window only the tokens not scored by the previous window count
        towards the loss.

        Returns:
            A tensor holding exp(mean of the per-window losses).

        Raises:
            NotImplementedError: if the dataset type is "text2text".
        """
        data_dict = dataset.to_dict()
        if data_dict['type'] == 'text2text':
            raise NotImplementedError("ppl evaluation is currently not supported for text2text dataset, please use text_only dataset.")
        texts = [ instance["text"] for instance in data_dict["instances"] ]
        encodings = model.get_tokenizer()("\n\n".join(texts), return_tensors="pt")
        # Define some constant
        max_length = self._get_max_window_length(model)

        print(f"The maximum sequence length : {max_length}")
        seq_len = encodings.input_ids.size(1)

        nlls = []
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, self.block_size):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # may be different from block_size on last loop
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device=self.local_rank)
            target_ids = input_ids.clone()
            # -100 labels are skipped by the loss: mask the overlap that
            # the previous window already scored.
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = model.get_backend_model()(input_ids, labels=target_ids)
                # loss is calculated using CrossEntropyLoss which averages over valid labels
                # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
                # to the left by 1.
                neg_log_likelihood = outputs.loss

            nlls.append(neg_log_likelihood)
            prev_end_loc = end_loc
            print(f"Evaluating PPL: {int(begin_loc/self.block_size) + 1} / {int(seq_len/self.block_size)} Complete, current ppl : {torch.exp(torch.stack(nlls).mean())}")
            if end_loc == seq_len:
                break
        ppl = torch.exp(torch.stack(nlls).mean())
        return ppl


    def _evaluate_neg_log_likelihood(self, model, dataset: Dataset):
        """
        Evaluates negative log likelihood of the model over a dataset.

        NLL = -1/N sum_{i=1}^N sum_{j=1}^|w_i| ln(p(w_{i,j}|context_window)),

        where N is the number of data samples, w_{i,j} is the j-th token in
        i-th sample. Here "context_window" = p(w_{i,start}, w_{i,start+1}, ...,
        p_{i,j-1} with start = max(0, j - window_length + 1). "window_length"
        is normally the maximum length accepted by the model.

        Returns:
            A float which represents the negative log likelihood.

        Raises:
            NotImplementedError: if the dataset type is neither "text_only"
            nor "text2text".
        """
        data_dict = dataset.to_dict()

        # Handles prompt structure
        if dataset.get_type() == "text2text":
            prompt = self.evaluator_args.prompt_structure
            data_dict["instances"] = [
                {
                    "input": prompt.format(input=instance["input"]),
                    "output": instance["output"]
                }
                for instance in data_dict["instances"]
            ]

        dataset = dataset.from_dict(data_dict)
        tokenized_dataset = model.tokenize(dataset, add_special_tokens=False)
        tokenized_dataset = tokenized_dataset.get_backend_dataset()
        encoding_list = [
            {
                "input_ids": torch.tensor([input_ids]),
                "labels": torch.tensor([labels]),
            }
            for input_ids, labels in zip(tokenized_dataset["input_ids"],
                                         tokenized_dataset["labels"])
        ]

        # Gets context window length
        max_length = self._get_max_window_length(model)

        nlls = []
        full_nlls = []
        num_samples = len(encoding_list)
        for sample_idx, encodings in enumerate(encoding_list):
            seq_len = encodings["input_ids"].size(1)

            prev_end_loc = 0
            for begin_loc in range(0, seq_len, self.block_size):
                end_loc = min(begin_loc + max_length, seq_len)

                # may be different from block_size on last loop
                trg_len = end_loc - prev_end_loc
                input_ids = encodings["input_ids"][:, begin_loc:end_loc]
                input_ids = input_ids.to(device=self.local_rank)

                labels = encodings["labels"][:, begin_loc:end_loc]
                target_ids = labels.clone()
                full_target_ids = input_ids.clone()

                def get_nll(label_ids, nll_list):
                    # Mask the tokens already scored by the previous window.
                    label_ids[:, :-trg_len] = -100
                    label_ids = label_ids.to(device=self.local_rank)

                    # Valid labels are from 0 to `vocab_size`
                    num_valid_labels = torch.count_nonzero(label_ids >= 0)
                    if label_ids[0, 0] != -100:
                        num_valid_labels -= 1

                    if not torch.all(label_ids == -100):
                        with torch.no_grad():
                            outputs = model.get_backend_model()(
                                input_ids, labels=label_ids
                            )
                            # loss is calculated using CrossEntropyLoss which
                            # sums over valid labels N.B. the model only
                            # calculates loss over trg_len - 1 labels, because
                            # it internally shifts the labels to the left by 1.
                            neg_log_likelihood = outputs.loss * num_valid_labels
                    else:
                        neg_log_likelihood = torch.zeros([]).to(
                            device=self.local_rank
                        )

                    nll_list.append(neg_log_likelihood)

                get_nll(target_ids, nlls)
                get_nll(full_target_ids, full_nlls)

                current_output_nll = torch.stack(nlls).sum() / (sample_idx + 1)
                current_full_nll = torch.stack(full_nlls).sum() / (sample_idx + 1)

                prev_end_loc = end_loc
                if dataset.get_type() == "text_only":
                    print(
                        f"Evaluating negative log likelihood:"
                        f" {sample_idx + 1} / {num_samples} Complete,"
                        f" current nll: {current_full_nll}"
                    )
                elif dataset.get_type() == "text2text":
                    print(
                        f"Evaluating negative log likelihood:"
                        f" {sample_idx + 1} / {num_samples} Complete,"
                        f" current full nll / input nll / output nll:"
                        f" {current_full_nll} /"
                        f" {current_full_nll - current_output_nll} /"
                        f" {current_output_nll}"
                    )
                else:
                    # The `f` prefix used to sit inside the quotes, so the
                    # dataset type was never interpolated.
                    raise NotImplementedError(
                        f"{dataset.get_type()} typed datasets are not supported"
                    )

                if end_loc == seq_len:
                    break

        mean_nll = torch.stack(nlls).sum() / num_samples
        return mean_nll
lmflow/pipeline/finetuner.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """The Finetuner class simplifies the process of running finetuning process on a language model for a TunableModel instance with given dataset.
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ import sys
9
+
10
+ import datasets
11
+ import transformers
12
+
13
+ from itertools import chain
14
+ from transformers import (
15
+ Trainer,
16
+ default_data_collator,
17
+ set_seed,
18
+ )
19
+ from transformers.utils import send_example_telemetry
20
+
21
+ from lmflow.datasets.dataset import Dataset
22
+ from lmflow.pipeline.base_tuner import BaseTuner
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class Finetuner(BaseTuner):
    """
    Initializes the `Finetuner` class with given arguments.

    Parameters
    ------------
    model_args : ModelArguments object.
        Contains the arguments required to load the model.

    data_args : DatasetArguments object.
        Contains the arguments required to load the dataset.

    finetuner_args : FinetunerArguments object.
        Contains the arguments required to perform finetuning.

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.

    """
    def __init__(self, model_args, data_args, finetuner_args, *args, **kwargs):
        # `get_last_checkpoint` was called below without ever being
        # imported; import it locally so this block is self-contained.
        from transformers.trainer_utils import get_last_checkpoint

        self.model_args = model_args
        self.data_args = data_args
        self.finetuner_args = finetuner_args

        # Sending telemetry. Tracking the example usage helps us better
        # allocate resources to maintain them. The information sent is the one
        # passed as arguments along with your Python/PyTorch versions.
        send_example_telemetry("run_clm", model_args, data_args)

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            handlers=[logging.StreamHandler(sys.stdout)],
        )

        log_level = finetuner_args.get_process_log_level()
        logger.setLevel(log_level)
        datasets.utils.logging.set_verbosity(log_level)
        transformers.utils.logging.set_verbosity(log_level)
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

        # Log on each process the small summary:
        logger.warning(
            f"Process rank: {finetuner_args.local_rank},"
            f" device: {finetuner_args.device},"
            f" n_gpu: {finetuner_args.n_gpu},"
            f" distributed training: {bool(finetuner_args.local_rank != -1)},"
            f" 16-bits training: {finetuner_args.fp16}"
        )
        logger.info(f"Training/evaluation parameters {finetuner_args}")

        # Detecting last checkpoint.
        last_checkpoint = None
        if os.path.isdir(finetuner_args.output_dir) and finetuner_args.do_train and not finetuner_args.overwrite_output_dir:
            last_checkpoint = get_last_checkpoint(finetuner_args.output_dir)
            if last_checkpoint is None and len(os.listdir(finetuner_args.output_dir)) > 0:
                raise ValueError(
                    f"Output directory ({finetuner_args.output_dir}) already"
                    " exists and is not empty. "
                    "Use --overwrite_output_dir to overcome."
                )
            elif last_checkpoint is not None and finetuner_args.resume_from_checkpoint is None:
                logger.info(
                    f"Checkpoint detected, resuming training at"
                    f" {last_checkpoint}. To avoid this behavior, change"
                    " the `--output_dir` or add `--overwrite_output_dir` to"
                    " train from scratch."
                )
        self.last_checkpoint = last_checkpoint

        # Set seed before initializing model.
        set_seed(finetuner_args.seed)


    def group_text(self, tokenized_datasets, model_max_length):
        """
        Groups texts together to form blocks of maximum length `model_max_length` and returns the processed data as
        a dictionary.

        The block size is `data_args.block_size` capped at
        `model_max_length`; when `data_args.block_size` is None it is
        `model_max_length` capped at 1024.
        """
        data_args = self.data_args
        finetuner_args = self.finetuner_args

        if data_args.block_size is None:
            block_size = model_max_length
            if block_size > 1024:
                logger.warning(
                    "The chosen tokenizer supports a `model_max_length` that is"
                    " longer than the default `block_size` value"
                    " of 1024. If you would like to use a longer `block_size`"
                    " up to `tokenizer.model_max_length` you can override this "
                    " default with `--block_size xxx`."
                )
                block_size = 1024
        else:
            if data_args.block_size > model_max_length:
                logger.warning(
                    f"The block_size passed ({data_args.block_size}) is larger"
                    f" than the maximum length for the model"
                    f"({model_max_length})."
                    f" Using block_size={model_max_length}."
                )
            block_size = min(data_args.block_size, model_max_length)

        # Main data processing function that will concatenate all texts from
        # our dataset and generate chunks of block_size.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model
            # supported it instead of this drop, you can customize this part to
            # your needs.
            total_length = (total_length // block_size) * block_size
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts
        # together, so group_texts throws away a remainder for each of those
        # groups of 1,000 texts. You can adjust that batch_size here but a
        # higher value might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation
        # of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        with finetuner_args.main_process_first(desc="grouping texts together"):
            # A batch size of 1 keeps every sample separate, i.e. no
            # cross-sample concatenation happens.
            group_batch_size = 1 if data_args.disable_group_texts else 1000
            if not data_args.streaming:
                lm_datasets = tokenized_datasets.map(
                    group_texts,
                    batched=True,
                    batch_size=group_batch_size,
                    num_proc=data_args.preprocessing_num_workers,
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc=f"Grouping texts in chunks of {block_size}",
                )
            else:
                lm_datasets = tokenized_datasets.map(
                    group_texts,
                    batched=True,
                    batch_size=group_batch_size,
                )

        return lm_datasets


    def tune(self, model, dataset):
        """
        Perform tuning for a model

        Parameters
        ------------
        model : TunableModel object.
            TunableModel to perform tuning.

        dataset:
            dataset to train model.

        Returns
        ------------
        model :
            The same model object that was passed in.
        """
        model_args = self.model_args
        data_args = self.data_args
        finetuner_args = self.finetuner_args

        # Tokenization and text grouping must be done in the main process
        with finetuner_args.main_process_first(desc="dataset map tokenization"):
            tokenized_dataset = model.tokenize(dataset)
            lm_dataset = self.group_text(
                tokenized_dataset,
                model_max_length=model.get_max_length(),
            )

        train_dataset = lm_dataset.get_backend_dataset()

        if finetuner_args.do_train:
            if data_args.max_train_samples is not None:
                max_train_samples = min(len(train_dataset), data_args.max_train_samples)
                train_dataset = train_dataset.select(range(max_train_samples))

        # Initialize our Trainer
        training_args = finetuner_args
        trainer = Trainer(
            model=model.get_backend_model(),
            args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=None,
            tokenizer=model.get_tokenizer(),
            # Data collator will default to DataCollatorWithPadding, so we change it.
            data_collator=default_data_collator,
            compute_metrics=None,
            preprocess_logits_for_metrics=None,
        )

        # Training
        if training_args.do_train:
            # An explicit `resume_from_checkpoint` wins over the checkpoint
            # auto-detected in the output directory.
            checkpoint = None
            last_checkpoint = self.last_checkpoint
            if training_args.resume_from_checkpoint is not None:
                checkpoint = training_args.resume_from_checkpoint
            elif last_checkpoint is not None:
                checkpoint = last_checkpoint
            train_result = trainer.train(resume_from_checkpoint=checkpoint)

            if not model_args.use_lora:
                trainer.save_model()  # Saves the tokenizer too for easy upload
            else:
                if model_args.save_aggregated_lora:
                    model.merge_lora_weights()
                model.save(finetuner_args.output_dir, model_args.save_aggregated_lora)

            metrics = train_result.metrics

            max_train_samples = (
                data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
            )
            metrics["train_samples"] = min(max_train_samples, len(train_dataset))

            trainer.log_metrics("train", metrics)
            trainer.save_metrics("train", metrics)
            trainer.save_state()

        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        if training_args.push_to_hub:
            trainer.push_to_hub(**kwargs)
        else:
            trainer.create_model_card(**kwargs)

        return model
lmflow/pipeline/inferencer.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """The Inferencer class simplifies the process of model inferencing."""
4
+
5
+ import os
6
+ import torch
7
+ import wandb
8
+ import deepspeed
9
+ import sys
10
+ import numpy as np
11
+ import datetime
12
+ import json
13
+
14
+ from transformers import AutoConfig
15
+ import torch.distributed as dist
16
+
17
+ from lmflow.args import DatasetArguments
18
+ from lmflow.datasets.dataset import Dataset
19
+ from lmflow.pipeline.base_pipeline import BasePipeline
20
+ from lmflow.models.hf_decoder_model import HFDecoderModel
21
+ from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers
23
+
24
def rstrip_partial_utf8(string):
    """Return ``string`` with every U+FFFD replacement character removed.

    Despite the name, occurrences are dropped anywhere in the string, not
    only at its right end.
    """
    replacement_char = "\ufffd"
    pieces = string.split(replacement_char)
    return "".join(pieces)
26
+
27
class Inferencer(BasePipeline):
    """
    Initializes the `Inferencer` class with given arguments.

    Parameters
    ------------
    model_args : ModelArguments object.
        Contains the arguments required to load the model.

    data_args : DatasetArguments object.
        Contains the arguments required to load the dataset.

    inferencer_args : InferencerArguments object.
        Contains the arguments required to perform inference.

    """
    def __init__(self, model_args, data_args, inferencer_args):
        self.data_args = data_args
        self.inferencer_args = inferencer_args
        self.model_args = model_args

        set_random_seed(self.inferencer_args.random_seed)

        self.local_rank = int(os.getenv("LOCAL_RANK", "0"))
        self.world_size = int(os.getenv("WORLD_SIZE", "1"))
        if inferencer_args.device == "gpu":
            torch.cuda.set_device(self.local_rank)  # NOTE: cpu-only machine will have error
            deepspeed.init_distributed()
        else:
            # Any non-"gpu" device goes through a local gloo process group.
            os.environ["MASTER_ADDR"] = "localhost"
            os.environ["MASTER_PORT"] = "15000"
            dist.init_process_group(
                "gloo", rank=self.local_rank, world_size=self.world_size
            )

        # NOTE(review): trust_remote_code=True executes code shipped with the
        # checkpoint; only point this at model sources you trust.
        self.config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
        try:
            self.model_hidden_size = self.config.hidden_size
        except AttributeError:
            # Only a missing attribute is expected here; anything else should
            # surface instead of being silently replaced by the default.
            print("Error in setting hidden size, use the default size 1024")
            self.model_hidden_size = 1024  # gpt2 seems do not have hidden_size in config


    def create_dataloader(self, dataset: Dataset):
        """
        Wrap every text instance of `dataset` into a batch of size one.

        Parameters
        ------------
        dataset : Dataset object.
            Dataset whose `to_dict()["instances"]` entries carry a "text" key.

        Returns
        ------------
        dataloader : list.
            Batches produced by `batchlize` (batch size 1, no shuffling);
            each item is a dict with keys "input" and "input_idx".
        dataset_size : int.
            Number of instances.
        """
        data_dict = dataset.to_dict()
        dataset_buf = [
            {"input": instance["text"], "input_idx": idx}
            for idx, instance in enumerate(data_dict["instances"])
        ]

        dataloader = batchlize(
            dataset_buf,
            batch_size=1,
            random_shuffle=False,
        )
        return dataloader, len(dataset_buf)


    def inference(
        self,
        model,
        dataset: Dataset,
        max_new_tokens: int=100,
        temperature: float=0.0,
        prompt_structure: str='{input}',
    ):
        """
        Perform inference for a model

        Parameters
        ------------
        model : TunableModel object.
            TunableModel to perform inference

        dataset : Dataset object.
            Input dataset; must be of type "text_only".

        max_new_tokens : int.
            Forwarded to `model.inference`.

        temperature : float.
            Forwarded to `model.inference`.

        prompt_structure : str.
            Format string with an `{input}` placeholder applied to each text.

        Returns:

        output_dataset: Dataset object.
            A "text_only" dataset holding, per input, the decoded output with
            the decoded prompt prefix cut off.

        Raises
        ------------
        NotImplementedError
            If the dataset type is not "text_only", or if
            `inferencer_args.device` is neither "gpu" nor "cpu" (checked when
            the first instance is processed).
        """
        if dataset.get_type() != "text_only":
            raise NotImplementedError(
                'input dataset should have type "text_only"'
            )

        dataloader, _ = self.create_dataloader(dataset)

        # The output dataset
        output_dict = {
            "type": "text_only",
            "instances": [
            ]
        }

        for batch in dataloader:
            current_batch = batch[0]  # batch size is 1

            prompt = prompt_structure.format(input=current_batch['input'])

            if self.inferencer_args.device == "gpu":
                inputs = model.encode(prompt, return_tensors="pt").to(device=self.local_rank)
            elif self.inferencer_args.device == "cpu":
                inputs = model.encode(prompt, return_tensors="pt").to(device='cpu')
            else:
                raise NotImplementedError(
                    f"device \"{self.inferencer_args.device}\" is not supported"
                )

            outputs = model.inference(
                inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                repetition_penalty=1.0,
            )
            text_out = model.decode(outputs[0], skip_special_tokens=True)

            # only return the generation, truncating the input
            prompt_length = len(model.decode(inputs[0], skip_special_tokens=True,))
            text_out = text_out[prompt_length:]
            output_dict["instances"].append({ "text": text_out })

        output_dataset = Dataset(DatasetArguments(dataset_path = None))
        output_dataset = output_dataset.from_dict(output_dict)

        return output_dataset

    def stream_inference(self, context, model, max_new_tokens, token_per_step, temperature, end_string, input_dataset):
        """
        Generate a response incrementally, yielding the text produced so far.

        For configs whose `architectures` contain "ChatGLMModel" the backend
        model's `stream_chat` is used and `context` is the prompt. Otherwise
        `self.inference` is called `max_new_tokens // token_per_step` times
        with `token_per_step` new tokens each, and the first instance of
        `input_dataset` is extended with the new text after every step.

        Yields
        ------------
        (response, flag_break) : tuple of (str, bool).
            `response` is the accumulated text cut at `end_string`;
            `flag_break` is True once `end_string` was found in it (always
            False on the ChatGLM path). The generator itself never stops
            early; the consumer decides what to do with `flag_break`.
        """
        response = ""
        history = []
        if "ChatGLMModel" in self.config.architectures:
            for response, history in model.get_backend_model().stream_chat(model.get_tokenizer(), context, history=history):
                response = rstrip_partial_utf8(response)
                yield response, False
        else:
            for _ in range(0, max_new_tokens // token_per_step):
                output_dataset = self.inference(
                    model=model,
                    dataset=input_dataset,
                    max_new_tokens=token_per_step,
                    temperature=temperature,
                )

                new_append_text = output_dataset.to_dict()["instances"][0]["text"]
                new_append_text = rstrip_partial_utf8(new_append_text)
                response += new_append_text

                input_dict = input_dataset.to_dict()
                input_dict["instances"][0]["text"] += new_append_text

                input_dataset = input_dataset.from_dict(input_dict)

                index = response.find(end_string)
                flag_break = index != -1
                if not flag_break:
                    # Search again with end_string appended: this also trims a
                    # trailing fragment of `response` that only completes
                    # `end_string` together with the appended copy.
                    index = (response + end_string).find(end_string)

                response = response[:index]

                yield response, flag_break
lmflow/pipeline/raft_aligner.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ The Aligner class simplifies the process of running alignment.
5
+ """
6
+
7
+ import logging
8
+ import numpy as np
9
+ import os
10
+ import sys
11
+ import time
12
+ from itertools import chain
13
+
14
+ import torch
15
+ import torch.distributed as dist
16
+ import transformers
17
+ from datasets import (
18
+ set_caching_enabled,
19
+ Dataset,
20
+ DatasetDict,
21
+ )
22
+ from transformers import (
23
+ default_data_collator,
24
+ pipeline,
25
+ set_seed,
26
+ )
27
+ from transformers.testing_utils import CaptureLogger
28
+
29
+ from lmflow.args import DatasetArguments
30
+ from lmflow.datasets.dataset import Dataset as LMFlowDataset
31
+ from lmflow.pipeline.base_aligner import BaseAligner
32
+ from lmflow.pipeline.utils.raft_trainer import RaftTrainer
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
class RaftAligner(BaseAligner):
    """
    Initializes the `RaftAligner` class with given arguments.

    Parameters
    ------------
    model_args : ModelArguments object.
        Contains the arguments required to load the model.

    data_args : DatasetArguments object.
        Contains the arguments required to load the dataset.

    raft_aligner_args : RaftAlignerArguments object.
        Contains the arguments required to perform alignment.

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.

    """
    def __init__(self, model_args, data_args, aligner_args, *args, **kwargs):
        self.model_args = model_args
        self.data_args = data_args
        self.aligner_args = aligner_args

        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            handlers=[logging.StreamHandler(sys.stdout)],
        )

        logger.setLevel(logging.INFO)

        output_reward_path = aligner_args.output_reward_path
        if output_reward_path is not None:
            reward_dir = os.path.dirname(output_reward_path)
            # A bare file name has an empty dirname and os.makedirs("") raises,
            # so only create the directory when there is one to create.
            if reward_dir:
                os.makedirs(reward_dir, exist_ok=True)
            # Deletes a maybe-exist file
            try:
                os.remove(output_reward_path)
            except OSError:
                pass


    def _initialize_trainer(self, model, tokenizer, training_args):
        """
        This function takes the model and tokenizer as the input and initialize the trainer.

        The trainer starts with a one-row placeholder train dataset; `align`
        replaces `train_dataset` before every real training stage.
        """
        trainer = RaftTrainer(
            model=model,
            args=training_args,
            train_dataset=Dataset.from_dict({"text": [ " " ] }),
            eval_dataset=Dataset.from_dict({}),
            tokenizer=tokenizer,
            data_collator=default_data_collator,
            compute_metrics=None,
            preprocess_logits_for_metrics=None,
        )
        return trainer


    def _load_dataset(
        self,
        selected_dataset,
        model,
        tokenizer,
        model_args,
        data_args,
        training_args,
    ):
        '''
        This function prepares the dataset for every iteration.

        The text column of `selected_dataset` is tokenized, all token
        sequences are concatenated and cut into chunks of `block_size`, and
        `labels` is set to a copy of `input_ids`. Returns the resulting
        "train" split, truncated to `data_args.max_train_samples` when set.

        Raises ValueError when `training_args.do_train` is false or the
        dataset has no "train" split.
        '''
        raw_datasets = selected_dataset

        if training_args.do_train:
            column_names = list(raw_datasets["train"].features)
        else:
            column_names = list(raw_datasets["validation"].features)
        text_column_name = "text" if "text" in column_names else column_names[0]

        # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
        tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

        def tokenize_function(examples):
            with CaptureLogger(tok_logger) as cl:
                output = tokenizer(examples[text_column_name])
            # clm input could be much much longer than block_size
            if "Token indices sequence length is longer than the" in cl.out:
                tok_logger.warning(
                    "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                    " before being passed to the model."
                )
            return output

        with training_args.main_process_first(desc="dataset map tokenization"):
            if not data_args.streaming:
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    remove_columns=column_names,
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc="Running tokenizer on dataset",
                )
            else:
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    remove_columns=column_names,
                )

        if data_args.block_size is None:
            block_size = tokenizer.model_max_length
            if block_size > 1024:
                logger.warning(
                    "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
                    " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
                    " override this default with `--block_size xxx`."
                )
                # NOTE(review): the warning above mentions 1024 while the
                # fallback actually applied is 512 - confirm which is intended.
                block_size = 512
        else:
            if data_args.block_size > tokenizer.model_max_length:
                logger.warning(
                    f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
                    f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
                )
            block_size = min(data_args.block_size, tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            if total_length >= block_size:
                total_length = (total_length // block_size) * block_size
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
                for k, t in concatenated_examples.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
        # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
        # to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        with training_args.main_process_first(desc="grouping texts together"):
            group_batch_size = 1000
            if data_args.disable_group_texts:
                group_batch_size = 1
            if not data_args.streaming:
                lm_datasets = tokenized_datasets.map(
                    group_texts,
                    batched=True,
                    batch_size=group_batch_size,
                    num_proc=data_args.preprocessing_num_workers,
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc=f"Grouping texts in chunks of {block_size}",
                )
            else:
                lm_datasets = tokenized_datasets.map(
                    group_texts,
                    batched=True,
                    batch_size=group_batch_size,
                )

        if not training_args.do_train:
            # Previously this path fell through to an unbound `train_dataset`.
            raise ValueError("RaftAligner._load_dataset requires training_args.do_train to be set")

        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = lm_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

        return train_dataset


    def _load_input_dataset(self, dataset, tokenizer):
        """
        Load input dataset (i.e. prompt/question dataset) for training.

        Each sample's "text" is encoded, truncated to its first 16 tokens
        ("input_ids") and decoded back into the prompt string ("input").

        Args:
            dataset: A Dataset object.
                The dataset to be loaded.

        Returns:
            The mapped backend dataset with its format set to 'torch'.
        """
        ds = dataset.get_backend_dataset()

        def tokenize(sample):
            input_size = 16
            review_encode = tokenizer.encode(sample["text"])
            sample["input_ids"] = review_encode[:input_size]
            sample['input'] = tokenizer.decode(sample["input_ids"])
            return sample

        ds = ds.map(tokenize, batched=False)
        ds.set_format(type='torch')

        return ds


    def _get_batch_dataset_top(
        self,
        model,
        batch_input,
        alpha=0.2,
        iter_id=0,
        local_rank=0,
        output_min_length=16,
        output_max_length=48,
        infer_batch_size=8,
        generation_kwargs=None,
        tokenizer=None,
        training_args=None,
        reward_model=None,
        output_reward_path=None,
    ):
        """
        Generate responses for the prompts, score them with the reward model
        and keep the top-`alpha` fraction, gathered over all processes.

        :param batch_input: input prompts (reads the 'input' column)
        :param alpha: fraction of `len(prompts)` kept after ranking by reward
        :param iter_id: unused, kept for interface compatibility
        :param local_rank: unused, kept for interface compatibility
        :param generation_kwargs: extra keyword arguments for
            `model.generate`; copied, so the caller's dict is never mutated
        :return: DatasetDict with a "train" split holding prompt + response
            texts of the selected samples from every process

        Prompts are processed in groups of `infer_batch_size`; a trailing
        incomplete group is not generated for.
        """
        # Copy so neither a shared default nor the caller's dict is mutated
        # when max_new_tokens is injected below.
        generation_kwargs = dict(generation_kwargs) if generation_kwargs else {}

        querys = batch_input['input']
        data_size = len(querys)
        reward_eva = []
        input_texts = []
        responses = []
        for i, query in enumerate(querys):
            input_texts.append(query)
            if (i + 1) % infer_batch_size == 0:
                gen_len = np.random.randint(output_min_length, output_max_length)
                generation_kwargs["max_new_tokens"] = gen_len
                inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(training_args.device)
                with torch.no_grad():
                    outputs = model.generate(**inputs, **generation_kwargs)
                generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                # Strip the prompt so only the newly generated part remains.
                generated_texts = [
                    generated_text.replace(prompt, "")
                    for prompt, generated_text in zip(input_texts, generated_texts)
                ]
                texts_for_rewards = [q + r for q, r in zip(input_texts, generated_texts)]

                texts_for_reward_dataset = LMFlowDataset.create_from_dict({
                    "type": "text_only",
                    "instances": [
                        { "text": text } for text in texts_for_rewards
                    ],
                })

                reward_dataset = reward_model.inference(texts_for_reward_dataset)
                rewards = [ sample["value"] for sample in reward_dataset.to_dict()["instances"] ]

                reward_eva.extend(rewards)
                responses.extend(generated_texts)
                input_texts = []

        # Indices of the highest-reward samples, best first.
        idx = np.argsort(reward_eva)[::-1][:int(data_size * alpha)]
        data = [
            {"input": querys[j], "output": [responses[j]]}
            for j in range(len(reward_eva))
        ]
        output_data = [data[j] for j in idx]
        logger.info(f"collected data of {len(output_data)}")

        world_size = int(os.getenv("WORLD_SIZE", "1"))
        all_process_list =[{}] * world_size
        dist.all_gather_object(all_process_list, output_data)

        gathered_data = []
        for i in range(world_size):
            gathered_data.extend(all_process_list[i])

        reward_train = [reward_eva[j] for j in idx]

        reward_to_send = [np.mean(reward_eva), np.mean(reward_train)]
        all_process_rewards = [{}] * world_size
        dist.all_gather_object(all_process_rewards, reward_to_send)
        logger.info(all_process_rewards)

        if training_args.local_rank == 0 and output_reward_path is not None:
            with open(output_reward_path, mode='a') as fout:
                fout.write('mean reward: ' + str(np.mean([all_process_rewards[i][0] for i in range(world_size)])) + 'mean reward in training set: ' + str([all_process_rewards[i][1] for i in range(world_size)]))
                fout.write("\n")

        prompt_structure = "{definition}{input}{output}"
        output_dataset = {
            "text": [ prompt_structure.format(
                definition="", input=sample["input"], output=sample["output"][0]
            ) for sample in gathered_data
            ]
        }

        return DatasetDict({ "train": Dataset.from_dict(output_dataset) })


    def align(self, model, dataset, reward_model):
        """
        Perform alignment for a model

        Parameters
        ------------
        model : BaseModel object.
        dataset: Dataset object.
            Input dataset for model to generate outputs. The input and output
            will then be feed into reward model to get the reward for
            alignment.
        reward_model: RegressionModel object.

        Returns
        ------------
        The wrapped `model` that was passed in, saved to
        `aligner_args.output_dir` first when that is not None.
        """
        tokenizer = model.get_tokenizer()
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "left"

        dataset = self._load_input_dataset(dataset, tokenizer)
        set_caching_enabled(False)

        wrapped_model = model
        model = model.get_backend_model()

        generation_kwargs = {
            "min_length": -1,
            "top_k": 0.0,
            "top_p": 1.0,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
            "temperature":0.7
        }

        aligner_args = self.aligner_args
        training_args = aligner_args
        model_args = self.model_args
        data_args = self.data_args

        set_seed(42 + training_args.local_rank)

        ITERATION = aligner_args.num_raft_iteration
        M = aligner_args.raft_batch_size

        alpha = aligner_args.top_reward_percentage
        data_size = len(dataset['input'])

        raft_trainer = self._initialize_trainer(model, tokenizer, training_args)
        raft_trainer.train(resume_from_checkpoint=False, is_first_time=True)

        # Arguments shared by every _get_batch_dataset_top call below.
        sampling_kwargs = dict(
            output_min_length=aligner_args.output_min_length,
            output_max_length=aligner_args.output_max_length,
            infer_batch_size=aligner_args.inference_batch_size_per_device,
            generation_kwargs=generation_kwargs,
            tokenizer=tokenizer,
            training_args=training_args,
            reward_model=reward_model,
            output_reward_path=aligner_args.output_reward_path,
        )

        ##############
        batch_input = None
        iteration = 0
        for iteration in range(ITERATION):
            set_seed(88 + training_args.local_rank + 4 * (iteration+1))

            batch_input = dataset.select(np.random.randint(low=0, high=data_size, size=M))

            selected_dataset = self._get_batch_dataset_top(
                raft_trainer.tmp_model,
                batch_input,
                alpha,
                iteration,
                training_args.local_rank,
                **sampling_kwargs,
            )
            raft_trainer.train_dataset = self._load_dataset(
                selected_dataset,
                raft_trainer.tmp_model,
                tokenizer,
                model_args,
                data_args,
                training_args,
            )

            logger.info(f"iter {iteration}")
            start_time = time.time()
            raft_trainer.train(resume_from_checkpoint=False)
            end_time = time.time()
            logger.info("It takes %.2f s to train one stage", end_time - start_time)

        # NOTE(review): this final reward-logging pass is placed after the
        # loop (once, on the last batch); confirm against the upstream file.
        # It is skipped when no iteration ran, since there is no batch to score.
        if batch_input is not None:
            self._get_batch_dataset_top(
                raft_trainer.tmp_model,
                batch_input,
                alpha,
                iteration,
                training_args.local_rank,
                **sampling_kwargs,
            )

        if aligner_args.output_dir is not None:
            wrapped_model.save(aligner_args.output_dir)

        return wrapped_model
lmflow/pipeline/utils/__init__.py ADDED
File without changes
lmflow/pipeline/utils/raft_trainer.py ADDED
The diff for this file is too large to render. See raw diff
 
lmflow/utils/__init__.py ADDED
File without changes
lmflow/utils/constants.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ Commonly used constants.
5
+ """
6
+
7
# Short format description of the "text_only" dataset type.
TEXT_ONLY_DATASET_DESCRIPTION = (
"""
"text_only": a dataset with only raw text instances, with following format:

    {
        "type": "text_only",
        "instances": [
            { "text": "TEXT_1" },
            { "text": "TEXT_2" },
            ...
        ]
    }
"""
).lstrip("\n")


# Usage examples for the "text_only" dataset type. The second example calls
# `DatasetArguments`, matching the name it imports from `lmflow.args`.
TEXT_ONLY_DATASET_DETAILS = (
"""
    For example,

    ```python
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text_only",
        "instances": [
            { "text": "Human: Hello. Bot: Hi!" },
            { "text": "Human: How are you today? Bot: Fine, thank you!" },
        ]
    }
    dataset = Dataset.create_from_dict(data_dict)
    ```

    You may also save the corresponding format to json,
    ```python
    import json
    from lmflow.args import DatasetArguments
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text_only",
        "instances": [
            { "text": "Human: Hello. Bot: Hi!" },
            { "text": "Human: How are you today? Bot: Fine, thank you!" },
        ]
    }
    with open("data.json", "w") as fout:
        json.dump(data_dict, fout)

    data_args = DatasetArguments(dataset_path="data.json")
    dataset = Dataset(data_args)
    new_data_dict = dataset.to_dict()
    # `new_data_dict` Should have the same content as `data_dict`
    ```
"""
).lstrip("\n")


# Short format description of the "text2text" dataset type.
TEXT2TEXT_DATASET_DESCRIPTION = (
"""
"text2text": a dataset with input & output instances, with following format:

    {
        "type": "text2text",
        "instances": [
            { "input": "INPUT_1", "output": "OUTPUT_1" },
            { "input": "INPUT_2", "output": "OUTPUT_2" },
            ...
        ]
    }
"""
).lstrip("\n")


# Usage examples for the "text2text" dataset type. The second example calls
# `DatasetArguments`, matching the name it imports from `lmflow.args`.
TEXT2TEXT_DATASET_DETAILS = (
"""
    For example,

    ```python
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text2text",
        "instances": [
            {
                "input": "Human: Hello.",
                "output": "Bot: Hi!",
            },
            {
                "input": "Human: How are you today?",
                "output": "Bot: Fine, thank you! And you?",
            }
        ]
    }
    dataset = Dataset.create_from_dict(data_dict)
    ```

    You may also save the corresponding format to json,
    ```python
    import json
    from lmflow.args import DatasetArguments
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text2text",
        "instances": [
            {
                "input": "Human: Hello.",
                "output": "Bot: Hi!",
            },
            {
                "input": "Human: How are you today?",
                "output": "Bot: Fine, thank you! And you?",
            }
        ]
    }
    with open("data.json", "w") as fout:
        json.dump(data_dict, fout)

    data_args = DatasetArguments(dataset_path="data.json")
    dataset = Dataset(data_args)
    new_data_dict = dataset.to_dict()
    # `new_data_dict` Should have the same content as `data_dict`
    ```
"""
).lstrip("\n")


# Description followed by the usage examples, for each dataset type.
TEXT_ONLY_DATASET_LONG_DESCRITION = (
    TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS
)

TEXT2TEXT_DATASET_LONG_DESCRITION = (
    TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS
)
lmflow/utils/data_utils.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """The program includes several functions: setting a random seed,
2
+ loading data from a JSON file, batching data, and extracting answers from generated text.
3
+ """
4
+
5
+ import random
6
+ import numpy as np
7
+ import torch
8
+ import json
9
+ import re
10
def set_random_seed(seed: int):
    """
    Seed the `random`, `numpy` and `torch` generators with the same value;
    when CUDA is available, all CUDA devices are seeded as well.

    Parameters
    ------------
    seed : int
        The seed applied to every generator.

    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
25
+
26
def load_data(file_name: str):
    """
    Read a JSON dataset file and split its instances into inputs and outputs.

    The file must hold an object with a "type" entry and an "instances"
    list whose items each carry "input" and "output". A short summary is
    printed after loading.

    Parameters
    ------------
    file_name : str.
        The dataset file name.

    Returns
    ------------
    inputs : list.
        The input texts of the dataset.
    outputs : list.
        The output texts of the dataset.
    len : int.
        The length of the dataset.
    """
    with open(file_name, encoding='utf-8') as f:
        json_data = json.load(f)
        dataset_type = json_data["type"]
        # Read both fields per instance in one pass, input first.
        pairs = [
            (instance["input"], instance["output"])
            for instance in json_data["instances"]
        ]

    inputs = [pair[0] for pair in pairs]
    outputs = [pair[1] for pair in pairs]
    data_size = len(outputs)

    print(f"load dataset {file_name} success.\n")
    print(f"Type : {dataset_type}, datasize : {data_size}")

    return inputs, outputs, data_size
58
+
59
def batchlize(examples: list, batch_size: int, random_shuffle: bool):
    """
    Convert examples to a dataloader.

    Parameters
    ------------
    examples : list.
        Data list. Shuffled in place when `random_shuffle` is true.
    batch_size : int.
        Maximum number of examples per batch; must be positive.
    random_shuffle : bool
        If true, the dataloader shuffle the training data.

    Returns
    ------------
    dataloader:
        List of batches (each a list slice of `examples`); the last batch
        may be shorter than `batch_size`. Empty input gives an empty list.

    Raises
    ------------
    ValueError
        If `batch_size` is not positive (this used to loop forever).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if random_shuffle:
        random.shuffle(examples)

    return [
        examples[start : start + batch_size]
        for start in range(0, len(examples), batch_size)
    ]
90
+
91
+
92
+
93
def _extract_labeled_choice(text, labeled_pattern, bare_pattern):
    """Return the lower-cased choice found by the labeled pattern (group 2),
    else by the bare pattern (group 1), else "N/A"."""
    match = re.search(labeled_pattern, text)
    if match is not None:
        return match.group(2).lower()
    match = re.search(bare_pattern, text)
    if match is not None:
        return match.group(1).lower()
    return "N/A"


def answer_extraction(response, answer_type=None, concat_length=None):
    """
    Use this function to extract answers from generated text

    Parameters
    ------------
    response : str
        plain string response.
    answer_type : str
        Name of the task format, which selects the extraction rule:
        numeric tasks ("gsm8k", "svamp", "asdiv", "addsub", "singleeq",
        "multiarith", "math"), letter choices ("aqua", "csqa",
        "multiple_choice", "medmcqa", "usmle"), yes/no style
        ("strategyqa", "coin_flip", "pubmedqa", "binary_choice"),
        "last_letters", or "text" (response returned unchanged).
    concat_length : int, optional
        For "last_letters" only: keep this many trailing characters of the
        cleaned response. When None the whole cleaned response is returned.

    Returns
    ------------
    answer:
        Decoded answer (such as A, B, C, D, E for multiple-choice QA).
        "" when nothing could be extracted; "N/A" for the "pubmedqa",
        "binary_choice", "medmcqa" and "usmle" types.

    Raises
    ------------
    NotImplementedError
        If `answer_type` is not one of the supported names (including the
        default None).
    """
    if answer_type == "text":
        return response

    choice_tail = r"\(*(A|B|C|D|a|b|c|d)\)*(\.|\s)"
    if answer_type in ("pubmedqa", "binary_choice"):
        return _extract_labeled_choice(
            response,
            r"(answer|Answer|ANSWER|output|Output|OUTPUT|A): \(*(yes|Yes|YES|no|No|NO|maybe|Maybe|MAYBE)",
            r"(yes|Yes|YES|no|No|NO|maybe|Maybe|MAYBE)(\.|\s)",
        )
    if answer_type == "medmcqa":
        return _extract_labeled_choice(
            response,
            r"(answer|Answer|ANSWER|output|Output|OUTPUT|A): \(*(A|B|C|D|a|b|c|d)",
            choice_tail,
        )
    if answer_type == "usmle":
        return _extract_labeled_choice(
            response,
            r"(Answer|Output|A): \(*(A|B|C|D|a|b|c|d)",
            choice_tail,
        )

    # The remaining types collect candidate answers; the last one wins.
    if answer_type in ("gsm8k", "svamp", "asdiv", "addsub", "singleeq", "multiarith", "math"):
        # Drop thousands separators before looking for numbers.
        candidates = re.findall(r"-?\d+\.?\d*", response.replace(",", ""))
    elif answer_type in ("aqua", "csqa", "multiple_choice"):
        candidates = re.findall(r"A|B|C|D|E", response)
    elif answer_type in ("strategyqa", "coin_flip"):
        words = re.sub(r"[\"'\n.\s:,]", " ", response.lower()).split(" ")
        candidates = [word for word in words if word in ("yes", "no")]
    elif answer_type == "last_letters":
        candidates = [re.sub(r"[\"'\n.\s]", "", response)]
    else:
        raise NotImplementedError(f"Unsupported answer type: {answer_type}")

    if not candidates:
        return ""

    answer = candidates[-1]
    # if there is . at the end of answer, remove it
    # e.g. answer = 64.
    if answer.endswith("."):
        answer = answer[:-1]

    if answer_type in ("gsm8k", "svamp"):
        # round the answer to nearest integer
        try:
            answer = str(round(float(answer)))
        except (ValueError, OverflowError):
            answer = ""  # no sol or sol doesn't have valid format
    elif answer_type == "last_letters" and concat_length is not None:
        answer = answer[-concat_length:]
    return answer
lmflow/version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = "0.0.1"
requirements.txt CHANGED
@@ -1 +1,13 @@
1
- lmflow @ git+https://github.com/OptimalScale/LMFlow.git@c21a511c8abcb2bc9fba3ae4de847806688bdd3d
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==1.24.2
2
+ datasets==2.10.1
3
+ peft @ git+https://github.com/huggingface/peft.git@deff03f2c251534fffd2511fc2d440e84cc54b1b
4
+ torch==2.0.0
5
+ wandb==0.14.0
6
+ deepspeed==0.8.3
7
+ trl @ git+https://github.com/lvwerra/trl.git#egg=trl-0.4.1
8
+ sentencepiece
9
+ transformers @ git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda
10
+ flask
11
+ flask_cors
12
+ icetk
13
+ cpm_kernels==1.0.11