from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the dataset
dataset = load_dataset('yiyang0101/yiyang-test')

# Split the dataset (80% train, 20% validation)
train_test_split = dataset['train'].train_test_split(test_size=0.2)
datasets_split = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenization function for the dataset
def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True)

# Apply tokenization to the split datasets
tokenized_datasets = datasets_split.map(tokenize_function, batched=True)

# Training configuration
training_args = TrainingArguments(
    output_dir="./yiyang-test",              # where checkpoints and results are saved
    evaluation_strategy="epoch",             # evaluate on the validation split every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    hub_model_id="yiyang0101/yiyang-test",   # target repo for push_to_hub below
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],  # validation dataset for per-epoch evaluation
)

# Train the model
trainer.train()

# Save the model locally
trainer.save_model("./")

# Push the model to the Hub (the target repo comes from hub_model_id above;
# Trainer.push_to_hub's first argument is a commit message, not a repo id)
trainer.push_to_hub()

# Push the tokenizer to the same Hub repo
tokenizer.push_to_hub("yiyang0101/yiyang-test")
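
# Optional follow-up sketch, not part of the training script above: it assumes the
# push succeeded and that the "yiyang0101/yiyang-test" repo is accessible (public,
# or you are logged in with a Hub token). It reloads the fine-tuned checkpoint from
# the Hub and runs a quick sanity-check prediction via the pipeline API.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="yiyang0101/yiyang-test",
    tokenizer="yiyang0101/yiyang-test",
)
# With num_labels=2 and no id2label mapping, the outputs use the default
# label names LABEL_0 / LABEL_1.
print(classifier("This is a sample sentence to classify."))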