Upload 3 files
- autotrain_llm.py +97 -0
- finetune.py +63 -0
- script.py +60 -0
autotrain_llm.py
ADDED
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
"""AutoTrain_LLM.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/colabs/AutoTrain_LLM.ipynb
"""

#@title 🤗 AutoTrain LLM
#@markdown In order to use this colab
#@markdown - upload train.csv to a folder named `data/`
#@markdown - train.csv must contain a `text` column
#@markdown - choose a project name if you wish
#@markdown - change model if you wish, you can use most of the text-generation models from Hugging Face Hub
#@markdown - add huggingface information (token) if you wish to push trained model to huggingface hub
#@markdown - update hyperparameters if you wish
#@markdown - click `Runtime > Run all` or run each cell individually
#@markdown - report issues / feature requests here: https://github.com/huggingface/autotrain-advanced/issues

import os
!pip install -U autotrain-advanced > install_logs.txt
!autotrain setup --colab > setup_logs.txt

#@markdown ---
#@markdown #### Project Config
#@markdown Note: if you are using a restricted/private model, you need to enter your Hugging Face token in the next step.
project_name = 'my-autotrain-llm' # @param {type:"string"}
model_name = 'abhishek/llama-2-7b-hf-small-shards' # @param {type:"string"}

#@markdown ---
#@markdown #### Push to Hub?
#@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face Account
#@markdown If you don't use these, the model will be saved in Google Colab and you are required to download it manually.
#@markdown Please enter your Hugging Face write token. The trained model will be saved to your Hugging Face account.
#@markdown You can find your token here: https://huggingface.co/settings/tokens
push_to_hub = False # @param ["False", "True"] {type:"raw"}
hf_token = "hf_XXX" #@param {type:"string"}
hf_username = "abc" #@param {type:"string"}

#@markdown ---
#@markdown #### Hyperparameters
learning_rate = 2e-4 # @param {type:"number"}
num_epochs = 1 #@param {type:"number"}
batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
block_size = 1024 # @param {type:"number"}
trainer = "sft" # @param ["default", "sft", "orpo"] {type:"raw"}
warmup_ratio = 0.1 # @param {type:"number"}
weight_decay = 0.01 # @param {type:"number"}
gradient_accumulation = 4 # @param {type:"number"}
mixed_precision = "fp16" # @param ["fp16", "bf16", "none"] {type:"raw"}
peft = True # @param ["False", "True"] {type:"raw"}
quantization = "int4" # @param ["int4", "int8", "none"] {type:"raw"}
lora_r = 16 #@param {type:"number"}
lora_alpha = 32 #@param {type:"number"}
lora_dropout = 0.05 #@param {type:"number"}

# Export the notebook parameters as environment variables so the
# `autotrain` CLI call below can read them via shell expansion.
os.environ["PROJECT_NAME"] = project_name
os.environ["MODEL_NAME"] = model_name
os.environ["PUSH_TO_HUB"] = str(push_to_hub)
os.environ["HF_TOKEN"] = hf_token
os.environ["LEARNING_RATE"] = str(learning_rate)
os.environ["NUM_EPOCHS"] = str(num_epochs)
os.environ["BATCH_SIZE"] = str(batch_size)
os.environ["BLOCK_SIZE"] = str(block_size)
os.environ["WARMUP_RATIO"] = str(warmup_ratio)
os.environ["WEIGHT_DECAY"] = str(weight_decay)
os.environ["GRADIENT_ACCUMULATION"] = str(gradient_accumulation)
os.environ["MIXED_PRECISION"] = str(mixed_precision)
os.environ["PEFT"] = str(peft)
os.environ["QUANTIZATION"] = str(quantization)
os.environ["LORA_R"] = str(lora_r)
os.environ["LORA_ALPHA"] = str(lora_alpha)
os.environ["LORA_DROPOUT"] = str(lora_dropout)
os.environ["HF_USERNAME"] = hf_username
os.environ["TRAINER"] = trainer

!autotrain llm \
--train \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path data/ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
--quantization ${QUANTIZATION} \
--mixed-precision ${MIXED_PRECISION} \
--username ${HF_USERNAME} \
--trainer ${TRAINER} \
$( [[ "$PEFT" == "True" ]] && echo "--peft" ) \
$( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN}" )
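As the notebook's own instructions state, training expects a `data/train.csv` with a `text` column. A minimal sketch of preparing such a file; the example rows and prompt format are purely illustrative, not part of the uploaded script:

import os
import pandas as pd

# Hypothetical rows: each row is one full training sample already
# collapsed into a single string in the `text` column.
train_df = pd.DataFrame(
    {
        "text": [
            "### Instruction: say hi\n### Response: hi",
            "### Instruction: say bye\n### Response: bye",
        ]
    }
)
os.makedirs("data", exist_ok=True)
train_df.to_csv("data/train.csv", index=False)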
finetune.py
ADDED
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
"""Finetune.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1b_AA5GHhblSKrQymYs_uYYDEqvqklfrV
"""

!pip install datasets transformers[torch]

!pip install evaluate

!pip install accelerate -U

from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][100]

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
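After `trainer.train()` finishes, the held-out split can be scored with the same `compute_metrics` function. A short follow-up sketch (not part of the uploaded file; the save path is illustrative):

# Run evaluation on small_eval_dataset and print the accuracy dict.
metrics = trainer.evaluate()
print(metrics)

# Optionally persist the fine-tuned model and tokenizer for later reuse.
trainer.save_model("test_trainer/final")
tokenizer.save_pretrained("test_trainer/final")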
script.py
ADDED
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
"""Finetune.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1b_AA5GHhblSKrQymYs_uYYDEqvqklfrV
"""

from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][100]

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.push_to_hub()
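The final `trainer.push_to_hub()` call only succeeds if the runtime is authenticated with a Hugging Face write token. A sketch of the setup this script assumes (the token placeholder and repo id are illustrative):

from huggingface_hub import login

# Paste a write token from https://huggingface.co/settings/tokens
login(token="hf_XXX")

# Alternatively, name the target repo up front so the Trainer pushes there:
# training_args = TrainingArguments(
#     output_dir="test_trainer",
#     evaluation_strategy="epoch",
#     push_to_hub=True,
#     hub_model_id="your-username/bert-yelp-demo",  # hypothetical repo id
# )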