# yiyang-test / train.py
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# Load the dataset
dataset = load_dataset('yiyang0101/yiyang-test')
# Split the dataset (80% train, 20% validation)
train_test_split = dataset['train'].train_test_split(test_size=0.2)
datasets_split = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})
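# Note: train_test_split shuffles before splitting; pass seed=<int> (e.g. seed=42,
# an arbitrary value) if the split needs to be reproducible across runs.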
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
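# num_labels=2 assumes a binary classification task; adjust it to the number of
# label classes actually present in yiyang0101/yiyang-test.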
# Tokenization function for the dataset
def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True)
# Apply tokenization to both splits
tokenized_datasets = datasets_split.map(tokenize_function, batched=True)
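# Optional sanity check (assumes the dataset provides 'text' and 'label' columns):
print(tokenized_datasets['train'].column_names)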
# Training configuration
training_args = TrainingArguments(
    output_dir="./results",                 # where checkpoints and logs are written (unrelated to pushing the model)
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,                      # do not push automatically during training; we push explicitly below
    hub_model_id="yiyang0101/yiyang-test"   # target Hub repository used by trainer.push_to_hub()
)
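# Optional sketch: a simple accuracy metric for the epoch-level evaluation configured
# above. The function name and use of NumPy are our own choices, not part of the
# original script; pass compute_metrics=compute_metrics to the Trainer below to log it.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # predicted class per example
    return {"accuracy": float((predictions == labels).mean())}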
# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']  # validation set for the epoch-level evaluation
)
# Train the model
trainer.train()
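# Optional: run a final evaluation on the validation split and print the metrics.
eval_metrics = trainer.evaluate()
print(eval_metrics)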
# Save the model locally (to the repository root)
trainer.save_model("./")  # model files are written to the root path
# Push the model and tokenizer to the Hub repository configured above.
# Trainer.push_to_hub() takes the target repo from hub_model_id in TrainingArguments;
# it does not accept a repo_id argument, which is why the repo is set there instead.
trainer.push_to_hub()
tokenizer.push_to_hub(repo_id="yiyang0101/yiyang-test")
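# Note: pushing to the Hub requires authentication (e.g. `huggingface-cli login` or an
# HF_TOKEN environment variable) with write access to yiyang0101/yiyang-test.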