Chenyan99 committed
Commit a37e8ed
1 Parent(s): f7c26fe

Upload 9 files

added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<from>": 50265,
+   "<to>": 50266
+ }
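As a quick sanity check, the sketch below (assuming this repository has been downloaded to a hypothetical local directory ./checkpoint) loads the tokenizer and confirms that the two added markers resolve to the IDs listed above.

from transformers import RobertaTokenizerFast

# Hypothetical local path to a copy of this repository.
tokenizer = RobertaTokenizerFast.from_pretrained("./checkpoint")

print(tokenizer.convert_tokens_to_ids("<from>"))  # 50265
print(tokenizer.convert_tokens_to_ids("<to>"))    # 50266
print(len(tokenizer))                             # 50267 = 50265 base RoBERTa tokens + 2 added markers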
main.py ADDED
@@ -0,0 +1,102 @@
+ import os
+ import torch
+
+ import numpy as np
+ import torch.nn as nn
+ from huggingface_hub import PyTorchModelHubMixin
+ from transformers import EncoderDecoderModel, RobertaTokenizerFast, PreTrainedModel
+ from torch.utils.data import DataLoader, TensorDataset
+
+ class DependencyAnalyzer(nn.Module, PyTorchModelHubMixin):
+     def __init__(self, encoder: PreTrainedModel | None = None,
+                  match_tokenizer: RobertaTokenizerFast | None = None):
+         super(DependencyAnalyzer, self).__init__()
+         if not encoder:
+             encoder: PreTrainedModel = EncoderDecoderModel.from_encoder_decoder_pretrained("microsoft/codebert-base", "microsoft/codebert-base").encoder
+         if match_tokenizer:
+             encoder.resize_token_embeddings(len(match_tokenizer))
+             encoder.config.decoder_start_token_id = match_tokenizer.cls_token_id
+             encoder.config.pad_token_id = match_tokenizer.pad_token_id
+             encoder.config.eos_token_id = match_tokenizer.sep_token_id
+             encoder.config.vocab_size = match_tokenizer.vocab_size
+         self.encoder = encoder
+         self.dense = nn.Linear(768, 2)  # classification head over the pooled [CLS] representation
+
+     def forward(self, input_ids, attention_mask):
+         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+         pooler_output = outputs.pooler_output
+         output_2d = self.dense(pooler_output)
+         return output_2d
+
+ def load_model_and_tokenizer(model_dir, directly_load=True, model_with_structure_dir=None):
+     if directly_load:
+         tokenizer = RobertaTokenizerFast.from_pretrained(model_dir)
+         if model_with_structure_dir:
+             model = DependencyAnalyzer.from_pretrained(model_with_structure_dir)
+         else:
+             model = DependencyAnalyzer(match_tokenizer=tokenizer)
+             model.load_state_dict(torch.load(os.path.join(model_dir, 'pytorch_model.bin')))
+         return model, tokenizer
+
+     model = EncoderDecoderModel.from_pretrained(model_dir)
+     if not isinstance(model, EncoderDecoderModel):
+         raise RuntimeError(f"Model read from {model_dir} is not valid")
+     model = model.encoder
+     if not isinstance(model, PreTrainedModel):
+         raise RuntimeError("Encoder of original model is not valid")
+
+     tokenizer: RobertaTokenizerFast = RobertaTokenizerFast.from_pretrained("microsoft/codebert-base")
+     if not isinstance(tokenizer, RobertaTokenizerFast):
+         raise RuntimeError("Cannot read tokenizer as microsoft/codebert-base")
+     special_tokens = ['<from>', '<to>']
+     # tokenizer.add_tokens(my_tokens, special_tokens = False)
+     tokenizer.add_tokens(special_tokens, special_tokens=True)
+
+     model = DependencyAnalyzer(model, tokenizer)
+
+     return model, tokenizer
+
+ class DependencyClassifier:
+     def __init__(self, load_dir, load_with_model_structure=False):
+         self.model, self.tokenizer = load_model_and_tokenizer(load_dir, model_with_structure_dir=load_dir) \
+             if load_with_model_structure \
+             else load_model_and_tokenizer(load_dir)
+         if torch.cuda.is_available():
+             self.model.to(torch.device('cuda:1'))
+
+     def construct_pair(self, code_1: str, code_2: str):
+         return '<from>' + code_1 + '<to>' + code_2
+
+     def construct_corpus_pair(self, corpus: list[tuple[str, str]]):
+         return [self.construct_pair(code_1, code_2) for code_1, code_2 in corpus]
+
+     def gen(self, text: str):
+         sigmoid = nn.Sigmoid()
+         token_input = self.tokenizer(text, return_tensors='pt')  # ATTENTION: converted to batch here
+         if torch.cuda.is_available():
+             token_input = token_input.to(torch.device('cuda:1'))
+
+         with torch.no_grad():
+             outputs = self.model(
+                 input_ids=token_input['input_ids'],
+                 attention_mask=token_input['attention_mask']
+             )[0]
+         outputs = sigmoid(outputs).detach().cpu()
+         return outputs[1]  # score for the positive ("dependency") class
+
+     def batch_gen(self, corpus_pair: list[str]):
+         sigmoid = nn.Sigmoid()
+         device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
+         token_input = self.tokenizer(corpus_pair, return_tensors='pt', padding=True, truncation=True, max_length=512)
+         dataset = TensorDataset(token_input["input_ids"], token_input["attention_mask"])
+         dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
+
+         preds = []
+         with torch.no_grad():
+             for batch in dataloader:
+                 batch_input, attention_mask = [item.to(device) for item in batch]
+                 outputs = self.model(input_ids=batch_input, attention_mask=attention_mask)
+                 outputs = sigmoid(outputs)[:, 1]
+                 preds.append(outputs.detach().cpu())
+         preds = torch.cat(preds, dim=0)
+         return preds.numpy()
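A minimal usage sketch for the classes above, assuming this repository (including the LFS weight file) has been cloned into a hypothetical ./checkpoint directory and main.py is on the import path; the sample code strings are illustrative only.

from main import DependencyClassifier

clf = DependencyClassifier("./checkpoint")

# Single pair: wrap the two code fragments with <from>/<to> and score them.
pair = clf.construct_pair("def read(path): ...", "def load(path): return read(path)")
print(float(clf.gen(pair)))  # sigmoid score for the "dependency" class

# Batched scoring over several pairs.
pairs = clf.construct_corpus_pair([
    ("def a(): pass", "def b(): a()"),
    ("x = 1", "print('hello')"),
])
print(clf.batch_gen(pairs))  # numpy array, one score per pair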
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d1fb9be53ed3766622caeb3f01af5be70ff0d18645d20904ba0b4a63f34bb0b
+ size 498678894
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,73 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50265": {
+       "content": "<from>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50266": {
+       "content": "<to>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
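Because <from> and <to> are registered above as special added tokens, the fast tokenizer keeps them as single units instead of splitting them into subwords. A small sketch, again assuming a hypothetical local copy of this repository in ./checkpoint:

from transformers import RobertaTokenizerFast

tok = RobertaTokenizerFast.from_pretrained("./checkpoint")
ids = tok("<from>def a(): pass<to>def b(): a()")["input_ids"]
tokens = tok.convert_ids_to_tokens(ids)

# The markers survive as single tokens inside the encoded sequence.
assert "<from>" in tokens and "<to>" in tokens
print(tokens)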
trainer_state.json ADDED
@@ -0,0 +1,309 @@
+ {
+   "best_metric": 0.04831862449645996,
+   "best_model_checkpoint": "./model/new-14/checkpoint-4410",
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 4410,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.02,
+       "learning_rate": 1.1337868480725626e-05,
+       "loss": 0.4789,
+       "step": 100
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 2.267573696145125e-05,
+       "loss": 0.2222,
+       "step": 200
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 3.401360544217687e-05,
+       "loss": 0.109,
+       "step": 300
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 4.53514739229025e-05,
+       "loss": 0.072,
+       "step": 400
+     },
+     {
+       "epoch": 0.11,
+       "learning_rate": 4.964792934717747e-05,
+       "loss": 0.0751,
+       "step": 500
+     },
+     {
+       "epoch": 0.14,
+       "learning_rate": 4.905119942713928e-05,
+       "loss": 0.0525,
+       "step": 600
+     },
+     {
+       "epoch": 0.16,
+       "learning_rate": 4.8454469507101085e-05,
+       "loss": 0.0604,
+       "step": 700
+     },
+     {
+       "epoch": 0.18,
+       "learning_rate": 4.78577395870629e-05,
+       "loss": 0.0512,
+       "step": 800
+     },
+     {
+       "epoch": 0.2,
+       "learning_rate": 4.726100966702471e-05,
+       "loss": 0.0659,
+       "step": 900
+     },
+     {
+       "epoch": 0.23,
+       "learning_rate": 4.6664279746986514e-05,
+       "loss": 0.0771,
+       "step": 1000
+     },
+     {
+       "epoch": 0.25,
+       "learning_rate": 4.6067549826948325e-05,
+       "loss": 0.0599,
+       "step": 1100
+     },
+     {
+       "epoch": 0.27,
+       "learning_rate": 4.547081990691014e-05,
+       "loss": 0.054,
+       "step": 1200
+     },
+     {
+       "epoch": 0.29,
+       "learning_rate": 4.487408998687194e-05,
+       "loss": 0.0655,
+       "step": 1300
+     },
+     {
+       "epoch": 0.32,
+       "learning_rate": 4.4277360066833754e-05,
+       "loss": 0.0838,
+       "step": 1400
+     },
+     {
+       "epoch": 0.34,
+       "learning_rate": 4.3680630146795565e-05,
+       "loss": 0.0633,
+       "step": 1500
+     },
+     {
+       "epoch": 0.36,
+       "learning_rate": 4.308390022675737e-05,
+       "loss": 0.0608,
+       "step": 1600
+     },
+     {
+       "epoch": 0.39,
+       "learning_rate": 4.248717030671918e-05,
+       "loss": 0.0577,
+       "step": 1700
+     },
+     {
+       "epoch": 0.41,
+       "learning_rate": 4.1890440386680994e-05,
+       "loss": 0.0543,
+       "step": 1800
+     },
+     {
+       "epoch": 0.43,
+       "learning_rate": 4.12937104666428e-05,
+       "loss": 0.0464,
+       "step": 1900
+     },
+     {
+       "epoch": 0.45,
+       "learning_rate": 4.069698054660461e-05,
+       "loss": 0.0422,
+       "step": 2000
+     },
+     {
+       "epoch": 0.48,
+       "learning_rate": 4.0100250626566415e-05,
+       "loss": 0.0642,
+       "step": 2100
+     },
+     {
+       "epoch": 0.5,
+       "learning_rate": 3.950352070652823e-05,
+       "loss": 0.0528,
+       "step": 2200
+     },
+     {
+       "epoch": 0.52,
+       "learning_rate": 3.890679078649004e-05,
+       "loss": 0.0511,
+       "step": 2300
+     },
+     {
+       "epoch": 0.54,
+       "learning_rate": 3.831006086645185e-05,
+       "loss": 0.0706,
+       "step": 2400
+     },
+     {
+       "epoch": 0.57,
+       "learning_rate": 3.7713330946413655e-05,
+       "loss": 0.0474,
+       "step": 2500
+     },
+     {
+       "epoch": 0.59,
+       "learning_rate": 3.711660102637547e-05,
+       "loss": 0.0566,
+       "step": 2600
+     },
+     {
+       "epoch": 0.61,
+       "learning_rate": 3.651987110633727e-05,
+       "loss": 0.0557,
+       "step": 2700
+     },
+     {
+       "epoch": 0.63,
+       "learning_rate": 3.592314118629908e-05,
+       "loss": 0.0537,
+       "step": 2800
+     },
+     {
+       "epoch": 0.66,
+       "learning_rate": 3.532641126626089e-05,
+       "loss": 0.0646,
+       "step": 2900
+     },
+     {
+       "epoch": 0.68,
+       "learning_rate": 3.4729681346222707e-05,
+       "loss": 0.07,
+       "step": 3000
+     },
+     {
+       "epoch": 0.7,
+       "learning_rate": 3.413295142618451e-05,
+       "loss": 0.0508,
+       "step": 3100
+     },
+     {
+       "epoch": 0.73,
+       "learning_rate": 3.353622150614632e-05,
+       "loss": 0.0521,
+       "step": 3200
+     },
+     {
+       "epoch": 0.75,
+       "learning_rate": 3.293949158610813e-05,
+       "loss": 0.0636,
+       "step": 3300
+     },
+     {
+       "epoch": 0.77,
+       "learning_rate": 3.234276166606994e-05,
+       "loss": 0.0657,
+       "step": 3400
+     },
+     {
+       "epoch": 0.79,
+       "learning_rate": 3.1746031746031745e-05,
+       "loss": 0.0532,
+       "step": 3500
+     },
+     {
+       "epoch": 0.82,
+       "learning_rate": 3.1149301825993556e-05,
+       "loss": 0.0574,
+       "step": 3600
+     },
+     {
+       "epoch": 0.84,
+       "learning_rate": 3.055257190595537e-05,
+       "loss": 0.0425,
+       "step": 3700
+     },
+     {
+       "epoch": 0.86,
+       "learning_rate": 2.9955841985917176e-05,
+       "loss": 0.0545,
+       "step": 3800
+     },
+     {
+       "epoch": 0.88,
+       "learning_rate": 2.9359112065878985e-05,
+       "loss": 0.0624,
+       "step": 3900
+     },
+     {
+       "epoch": 0.91,
+       "learning_rate": 2.8762382145840793e-05,
+       "loss": 0.0488,
+       "step": 4000
+     },
+     {
+       "epoch": 0.93,
+       "learning_rate": 2.81656522258026e-05,
+       "loss": 0.0462,
+       "step": 4100
+     },
+     {
+       "epoch": 0.95,
+       "learning_rate": 2.756892230576441e-05,
+       "loss": 0.0531,
+       "step": 4200
+     },
+     {
+       "epoch": 0.98,
+       "learning_rate": 2.6972192385726218e-05,
+       "loss": 0.0476,
+       "step": 4300
+     },
+     {
+       "epoch": 1.0,
+       "learning_rate": 2.6375462465688033e-05,
+       "loss": 0.0537,
+       "step": 4400
+     },
+     {
+       "epoch": 1.0,
+       "eval_f1_1": {
+         "f1": 0.9362880886426593
+       },
+       "eval_f1_2": {
+         "f1": 0.9555555555555556
+       },
+       "eval_loss": 0.04831862449645996,
+       "eval_precision_1": {
+         "precision": 0.9548022598870056
+       },
+       "eval_precision_2": {
+         "precision": 0.9666424945612763
+       },
+       "eval_recall_1": {
+         "recall": 0.9184782608695652
+       },
+       "eval_recall_2": {
+         "recall": 0.9447200566973778
+       },
+       "eval_runtime": 173.0776,
+       "eval_samples_per_second": 58.24,
+       "eval_steps_per_second": 3.64,
+       "step": 4410
+     }
+   ],
+   "logging_steps": 100,
+   "max_steps": 8820,
+   "num_train_epochs": 2,
+   "save_steps": 500,
+   "total_flos": 0.0,
+   "trial_name": null,
+   "trial_params": null
+ }
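The trainer state above is plain JSON, so the end-of-epoch evaluation can be pulled out with a few lines; a sketch assuming trainer_state.json sits in the current directory:

import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only log entries that carry evaluation metrics.
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]
last_eval = eval_logs[-1]

print("best checkpoint:", state["best_model_checkpoint"])
print("eval loss:", last_eval["eval_loss"])
print("F1:", last_eval["eval_f1_1"]["f1"], last_eval["eval_f1_2"]["f1"])
print("precision:", last_eval["eval_precision_1"]["precision"], last_eval["eval_precision_2"]["precision"])
print("recall:", last_eval["eval_recall_1"]["recall"], last_eval["eval_recall_2"]["recall"])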
vocab.json ADDED
The diff for this file is too large to render. See raw diff