jjmakes committed
Commit
4b25b72
1 Parent(s): b3fb325

Create train.py

Files changed (1): train.py +231 -0
train.py ADDED
@@ -0,0 +1,231 @@
+ # John Makely
+ # Fine-tune multi-label toxic-comment classifiers (BERTweet, RoBERTa, DistilBERT)
+ # Data: ./jigsaw-toxic-comment-classification-challenge/train.csv
+
+ # Columns: "id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate" [6 label columns]
+
+ # 1. Extract text from the csv
+ # 2. Tokenize the text (BERTweet, RoBERTa, DistilBERT)
+ # 3. Pair each tokenized text with all six labels in a multi-label dataset
+ # 4. Train each model
+ # 5. Save each model
+
+
+ import os
+
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset
+ from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
+                           RobertaTokenizer, Trainer, TrainingArguments)
+
+ # Release any cached GPU memory from a previous run (no-op without CUDA)
+ torch.cuda.empty_cache()
+
+
+ # Dataset wrapping the tokenizer encodings and a (num_samples, 6) label matrix;
+ # labels are returned as floats so the model trains with a multi-label BCE loss
+ class MultiLabelClassifierDataset(Dataset):
+     def __init__(self, encodings, labels):
+         self.encodings = encodings
+         self.labels = labels
+
+     def __getitem__(self, idx):
+         item = {key: torch.tensor(val[idx])
+                 for key, val in self.encodings.items()}
+         item['labels'] = torch.tensor(self.labels[idx]).float()
+         return item
+
+     def __len__(self):
+         return len(self.labels)
+
+
+ # Set up directories
+ work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
+ dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'
+
+ # Set up labels (the six binary label columns)
+ classifiers = ['toxic', 'severe_toxic', 'obscene',
+                'threat', 'insult', 'identity_hate']
+
+ # Load train.csv and shuffle it; only a training slice is taken below
+ print("Loading data...")
+ df = pd.read_csv(dataset_dir + 'train.csv')
+ df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle with a fixed seed
+
+ # Keep the first 10% of the shuffled rows for training
+ train_df = df[:int(len(df)*0.1)]
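+
+ # A minimal sketch of how val/test slices could be carved from the remainder
+ # (hypothetical: val_df and test_df are not consumed anywhere in this script)
+ val_df = df[int(len(df)*0.1):int(len(df)*0.2)]
+ test_df = df[int(len(df)*0.2):]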
+
+ # Extract the six label columns into a (num_samples, 6) numpy array of 0/1 values
+ train_labels = train_df[classifiers].to_numpy()
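+
+ # Sanity check: one row per comment, one column per label
+ assert train_labels.shape == (len(train_df), len(classifiers))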
+
+ # Report the device; Trainer moves the model to CUDA automatically when available
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ print("Using device: ", device)
+
+
+ # # # # # # # # # # # # #
+ # # # # BERTweet # # # # #
+ # # # # # # # # # # # # #
+
+ training_args = TrainingArguments(
+     output_dir='./results',
+     num_train_epochs=2,
+     per_device_train_batch_size=32,
+     per_device_eval_batch_size=64,
+     warmup_steps=500,
+     weight_decay=0.01,
+     logging_dir='./logs',
+     logging_steps=10,
+     fp16=True
+ )
+
+ print("BERTweet")
+ bert_dir = work_dir + 'bert/'
+
+ print("Tokenizing")
+ print("Model base: ", "vinai/bertweet-base")
+ tokenizer = AutoTokenizer.from_pretrained(
+     "vinai/bertweet-base", model_max_length=128)
+
+ print("Creating train encodings...")
+ train_encodings = tokenizer(
+     train_df['comment_text'].tolist(), truncation=True, padding=True)
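+
+ # train_encodings is a dict-like BatchEncoding of Python lists (e.g. 'input_ids'
+ # and 'attention_mask'), padded to the longest comment and truncated to
+ # model_max_length=128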
+
+ print("Training model to be stored in " + bert_dir)
+
+ # Create dataset
+ print("Creating dataset")
+ train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
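+
+ # Optional sanity check on one example: encoder inputs come back as tensors
+ # and the label is a float vector with one entry per class
+ sample = train_dataset[0]
+ assert sample['labels'].shape == (len(classifiers),)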
+
+ # Load model; problem_type makes the multi-label BCE loss explicit
+ print("Loading model for training...")
+ model = AutoModelForSequenceClassification.from_pretrained(
+     'vinai/bertweet-base', num_labels=6,
+     problem_type="multi_label_classification")
101
+
102
+ # Create Trainer
103
+ print("Creating trainer...")
104
+ trainer = Trainer(
105
+ model=model,
106
+ args=training_args,
107
+ train_dataset=train_dataset
108
+ )
109
+
110
+ # Train
111
+
112
+ print("Training...")
113
+ trainer.train()
114
+
115
+ # # Save model
116
+ print("Saving model to " + bert_dir + '_bert_model')
117
+ trainer.save_model(bert_dir + '_bert_model')
118
+
119
+
120
+ # # # # # # # # # # # #
121
+ # # # # RoBERTa # # # #
122
+ # # # # # # # # # # # #
123
+
124
+ training_args = TrainingArguments(
125
+ output_dir='./results',
126
+ num_train_epochs=1,
127
+ per_device_train_batch_size=32,
128
+ per_device_eval_batch_size=16,
129
+ warmup_steps=500,
130
+ weight_decay=0.01,
131
+ logging_dir='./logs',
132
+ logging_steps=10,
133
+ fp16=True
134
+ )
135
+
136
+ # RoBERTa
137
+ print("RoBERTa")
138
+ roberta_dir = work_dir + 'roberta/'
139
+
140
+ print("Tokenizing")
141
+ print("Model base: ", 'roberta-base')
142
+ tokenizer = RobertaTokenizer.from_pretrained(
143
+ 'roberta-base', model_max_length=128)
144
+
145
+ train_encodings = tokenizer(
146
+ train_df['comment_text'].tolist(), truncation=True, padding=True)
147
+
148
+
149
+ # Create dataset
150
+ print("Creating dataset")
151
+ train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
152
+
153
+ # Load model
154
+ print("Loading model for training...")
155
+ model = AutoModelForSequenceClassification.from_pretrained(
156
+ 'roberta-base', num_labels=6)
157
+
158
+ # Create Trainer
159
+ print("Creating trainer...")
160
+ trainer = Trainer(
161
+ model=model,
162
+ args=training_args,
163
+ train_dataset=train_dataset
164
+ )
165
+
166
+
167
+ # Train
168
+ print("Training...")
169
+ trainer.train()
170
+
171
+ # Save model
172
+ print("Saving model to " + roberta_dir + '_roberta_model')
173
+ trainer.save_model(roberta_dir + '_roberta_model')
174
+
175
+
176
+ # # # # # # # # # # # ##
177
+ # # # distilbert # # # #
178
+ # # # # # # # # # # # ##
179
+
180
+
181
+ training_args = TrainingArguments(
182
+ output_dir='./results',
183
+ num_train_epochs=1,
184
+ per_device_train_batch_size=32,
185
+ per_device_eval_batch_size=64,
186
+ warmup_steps=500,
187
+ weight_decay=0.01,
188
+ logging_dir='./logs',
189
+ logging_steps=10,
190
+ fp16=True
191
+ )
192
+
193
+
194
+ print("DISTILBERT")
195
+ distilbert_dir = work_dir + 'distilbert/'
196
+
197
+ print("Tokenizing")
198
+ print("Model base: ", 'distilbert-base-cased')
199
+ tokenizer = AutoTokenizer.from_pretrained(
200
+ 'distilbert-base-cased', model_max_length=128)
201
+
202
+ print("Creating train encodings...")
203
+ train_encodings = tokenizer(
204
+ train_df['comment_text'].tolist(), truncation=True, padding=True)
205
+
206
+ print("Training model to be stored in" + distilbert_dir)
207
+
208
+ # Create dataset
209
+ print("Creating dataset")
210
+ train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
211
+
212
+ # Load model
213
+ print("Loading model for training...")
214
+ model = AutoModelForSequenceClassification.from_pretrained(
215
+ 'distilbert-base-cased', num_labels=6)
216
+
217
+ # Create Trainer
218
+ print("Creating trainer...")
219
+ trainer = Trainer(
220
+ model=model,
221
+ args=training_args,
222
+ train_dataset=train_dataset
223
+ )
224
+
225
+ # Train
226
+ print("Training...")
227
+ trainer.train()
228
+
229
+ # Save model
230
+ print("Saving model to " + distilbert_dir + '_distilbert_model')
231
+ trainer.save_model(distilbert_dir + '_distilbert_model')
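+
+
+ # A minimal inference sketch against the just-saved DistilBERT checkpoint
+ # (illustrative only, not part of the original training plan): sigmoid turns
+ # the six logits into independent per-label probabilities
+ loaded = AutoModelForSequenceClassification.from_pretrained(
+     distilbert_dir + '_distilbert_model')
+ inputs = tokenizer("example comment", return_tensors="pt",
+                    truncation=True, max_length=128)
+ with torch.no_grad():
+     probs = torch.sigmoid(loaded(**inputs).logits)[0]
+ print(dict(zip(classifiers, probs.tolist())))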