sdyy committed on
Commit 96601f8 · verified · 1 Parent(s): 1dffc00

Upload 3 files

Files changed (3)
  1. autotrain_llm.py +97 -0
  2. finetune.py +63 -0
  3. script.py +60 -0
autotrain_llm.py ADDED
@@ -0,0 +1,97 @@
+ # -*- coding: utf-8 -*-
+ """AutoTrain_LLM.ipynb
+ Automatically generated by Colab.
+ Original file is located at
+ https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/colabs/AutoTrain_LLM.ipynb
+ """
+
+ #@title 🤗 AutoTrain LLM
+ #@markdown In order to use this Colab:
+ #@markdown - upload train.csv to a folder named `data/`
+ #@markdown - train.csv must contain a `text` column
+ #@markdown - choose a project name if you wish
+ #@markdown - change the model if you wish; you can use most of the text-generation models from the Hugging Face Hub
+ #@markdown - add your Hugging Face information (token) if you wish to push the trained model to the Hugging Face Hub
+ #@markdown - update hyperparameters if you wish
+ #@markdown - click `Runtime > Run all` or run each cell individually
+ #@markdown - report issues / feature requests here: https://github.com/huggingface/autotrain-advanced/issues
+
+ import os
+ !pip install -U autotrain-advanced > install_logs.txt
+ !autotrain setup --colab > setup_logs.txt
+
+ #@markdown ---
+ #@markdown #### Project Config
+ #@markdown Note: if you are using a restricted/private model, you need to enter your Hugging Face token in the next step.
+ project_name = 'my-autotrain-llm' # @param {type:"string"}
+ model_name = 'abhishek/llama-2-7b-hf-small-shards' # @param {type:"string"}
+
+ #@markdown ---
+ #@markdown #### Push to Hub?
+ #@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face account.
+ #@markdown If you don't use these, the model will be saved in Google Colab and you will need to download it manually.
+ #@markdown Please enter your Hugging Face write token. The trained model will be saved to your Hugging Face account.
+ #@markdown You can find your token here: https://huggingface.co/settings/tokens
+ push_to_hub = False # @param ["False", "True"] {type:"raw"}
+ hf_token = "hf_XXX" #@param {type:"string"}
+ hf_username = "abc" #@param {type:"string"}
+
+ #@markdown ---
+ #@markdown #### Hyperparameters
+ learning_rate = 2e-4 # @param {type:"number"}
+ num_epochs = 1 #@param {type:"number"}
+ batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
+ block_size = 1024 # @param {type:"number"}
+ trainer = "sft" # @param ["default", "sft", "orpo"] {type:"raw"}
+ warmup_ratio = 0.1 # @param {type:"number"}
+ weight_decay = 0.01 # @param {type:"number"}
+ gradient_accumulation = 4 # @param {type:"number"}
+ mixed_precision = "fp16" # @param ["fp16", "bf16", "none"] {type:"raw"}
+ peft = True # @param ["False", "True"] {type:"raw"}
+ quantization = "int4" # @param ["int4", "int8", "none"] {type:"raw"}
+ lora_r = 16 #@param {type:"number"}
+ lora_alpha = 32 #@param {type:"number"}
+ lora_dropout = 0.05 #@param {type:"number"}
+
+ os.environ["PROJECT_NAME"] = project_name
+ os.environ["MODEL_NAME"] = model_name
+ os.environ["PUSH_TO_HUB"] = str(push_to_hub)
+ os.environ["HF_TOKEN"] = hf_token
+ os.environ["LEARNING_RATE"] = str(learning_rate)
+ os.environ["NUM_EPOCHS"] = str(num_epochs)
+ os.environ["BATCH_SIZE"] = str(batch_size)
+ os.environ["BLOCK_SIZE"] = str(block_size)
+ os.environ["WARMUP_RATIO"] = str(warmup_ratio)
+ os.environ["WEIGHT_DECAY"] = str(weight_decay)
+ os.environ["GRADIENT_ACCUMULATION"] = str(gradient_accumulation)
+ os.environ["MIXED_PRECISION"] = str(mixed_precision)
+ os.environ["PEFT"] = str(peft)
+ os.environ["QUANTIZATION"] = str(quantization)
+ os.environ["LORA_R"] = str(lora_r)
+ os.environ["LORA_ALPHA"] = str(lora_alpha)
+ os.environ["LORA_DROPOUT"] = str(lora_dropout)
+ os.environ["HF_USERNAME"] = hf_username
+ os.environ["TRAINER"] = trainer
+
+ !autotrain llm \
+ --train \
+ --model ${MODEL_NAME} \
+ --project-name ${PROJECT_NAME} \
+ --data-path data/ \
+ --text-column text \
+ --lr ${LEARNING_RATE} \
+ --batch-size ${BATCH_SIZE} \
+ --epochs ${NUM_EPOCHS} \
+ --block-size ${BLOCK_SIZE} \
+ --warmup-ratio ${WARMUP_RATIO} \
+ --lora-r ${LORA_R} \
+ --lora-alpha ${LORA_ALPHA} \
+ --lora-dropout ${LORA_DROPOUT} \
+ --weight-decay ${WEIGHT_DECAY} \
+ --gradient-accumulation ${GRADIENT_ACCUMULATION} \
+ --quantization ${QUANTIZATION} \
+ --mixed-precision ${MIXED_PRECISION} \
+ --username ${HF_USERNAME} \
+ --trainer ${TRAINER} \
+ $( [[ "$PEFT" == "True" ]] && echo "--peft" ) \
+ $( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN}" )
finetune.py ADDED
@@ -0,0 +1,63 @@
+ # -*- coding: utf-8 -*-
+ """Finetune.ipynb
+ Automatically generated by Colab.
+ Original file is located at
+ https://colab.research.google.com/drive/1b_AA5GHhblSKrQymYs_uYYDEqvqklfrV
+ """
+
+ !pip install datasets transformers[torch]
+
+ !pip install evaluate
+
+ !pip install accelerate -U
+
+ from datasets import load_dataset
+
+ dataset = load_dataset("yelp_review_full")
+ dataset["train"][100]
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+ small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+
+ from transformers import AutoModelForSequenceClassification
+
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(output_dir="test_trainer")
+
+ import numpy as np
+ import evaluate
+
+ metric = evaluate.load("accuracy")
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     return metric.compute(predictions=predictions, references=labels)
+
+ from transformers import TrainingArguments, Trainer
+
+ training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=small_train_dataset,
+     eval_dataset=small_eval_dataset,
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
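
A small follow-up sketch (not part of the uploaded file) showing how the fine-tuned classifier might be checked and reused after `trainer.train()`; the directory name comes from `output_dir="test_trainer"` above, everything else is an assumption:

# Hedged sketch: evaluate and try the fine-tuned model after training.
eval_results = trainer.evaluate()      # runs compute_metrics on small_eval_dataset
print(eval_results)

trainer.save_model("test_trainer")     # write config + weights to the output dir

from transformers import pipeline
# Reuse the tokenizer object from above; it was not saved alongside the model.
classifier = pipeline("text-classification", model="test_trainer", tokenizer=tokenizer)
print(classifier("The food was great and the staff were friendly."))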
script.py ADDED
@@ -0,0 +1,60 @@
+ # -*- coding: utf-8 -*-
+ """Finetune.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1b_AA5GHhblSKrQymYs_uYYDEqvqklfrV
+ """
+ from datasets import load_dataset
+
+ dataset = load_dataset("yelp_review_full")
+ dataset["train"][100]
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
+ small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))
+
+ from transformers import AutoModelForSequenceClassification
+
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(output_dir="test_trainer")
+
+ import numpy as np
+ import evaluate
+
+ metric = evaluate.load("accuracy")
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     return metric.compute(predictions=predictions, references=labels)
+
+ from transformers import TrainingArguments, Trainer
+
+ training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=small_train_dataset,
+     eval_dataset=small_eval_dataset,
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
+
+ trainer.push_to_hub()
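
Unlike finetune.py, script.py ends with `trainer.push_to_hub()`, which by default uploads the checkpoint to a Hub repo named after `output_dir` under the logged-in account (a write token is needed, e.g. via `huggingface-cli login`). A hedged sketch of loading it back; the repo id is an assumption, and the tokenizer is loaded from `bert-base-cased` because it was not passed to the `Trainer` and is therefore not pushed:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

repo_id = "your-username/test_trainer"  # hypothetical; replace with your actual repo
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained(repo_id)

inputs = tokenizer("Terrible service, cold food.", return_tensors="pt")
pred = model(**inputs).logits.argmax(dim=-1).item()
print(pred)  # predicted class index 0-4 (yelp_review_full star buckets)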