diff --git a/Model/.gitignore b/Model/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fb2b5302a6d7ef242f0186f59893fbda118ebdc8 --- /dev/null +++ b/Model/.gitignore @@ -0,0 +1 @@ +!config.py diff --git a/Model/COSMIC/erc_training/commonsense_model.py b/Model/COSMIC/erc_training/commonsense_model.py new file mode 100644 index 0000000000000000000000000000000000000000..43006740bf65f7d7ce13a8a71ce03c8ad9ae49c9 --- /dev/null +++ b/Model/COSMIC/erc_training/commonsense_model.py @@ -0,0 +1,345 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.nn.utils.rnn import pad_sequence +import numpy as np, itertools, random, copy, math +from model import SimpleAttention, MatchingAttention, Attention + +class CommonsenseRNNCell(nn.Module): + + def __init__(self, D_m, D_s, D_g, D_p, D_r, D_i, D_e, listener_state=False, + context_attention='simple', D_a=100, dropout=0.5, emo_gru=True): + super(CommonsenseRNNCell, self).__init__() + + self.D_m = D_m + self.D_s = D_s + self.D_g = D_g + self.D_p = D_p + self.D_r = D_r + self.D_i = D_i + self.D_e = D_e + + # print ('dmsg', D_m, D_s, D_g) + self.g_cell = nn.GRUCell(D_m+D_p+D_r, D_g) + self.p_cell = nn.GRUCell(D_s+D_g, D_p) + self.r_cell = nn.GRUCell(D_m+D_s+D_g, D_r) + self.i_cell = nn.GRUCell(D_s+D_p, D_i) + self.e_cell = nn.GRUCell(D_m+D_p+D_r+D_i, D_e) + + + self.emo_gru = emo_gru + self.listener_state = listener_state + if listener_state: + self.pl_cell = nn.GRUCell(D_s+D_g, D_p) + self.rl_cell = nn.GRUCell(D_m+D_s+D_g, D_r) + + self.dropout = nn.Dropout(dropout) + + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + self.dropout4 = nn.Dropout(dropout) + self.dropout5 = nn.Dropout(dropout) + + if context_attention=='simple': + self.attention = SimpleAttention(D_g) + else: + self.attention = MatchingAttention(D_g, D_m, D_a, context_attention) + + def _select_parties(self, X, indices): + q0_sel = [] + for idx, j in zip(indices, X): + q0_sel.append(j[idx].unsqueeze(0)) + q0_sel = torch.cat(q0_sel,0) + return q0_sel + + def forward(self, U, x1, x2, x3, o1, o2, qmask, g_hist, q0, r0, i0, e0): + """ + U -> batch, D_m + x1, x2, x3, o1, o2 -> batch, D_m + x1 -> effect on self; x2 -> reaction of self; x3 -> intent of self + o1 -> effect on others; o2 -> reaction of others + qmask -> batch, party + g_hist -> t-1, batch, D_g + q0 -> batch, party, D_p + e0 -> batch, self.D_e + """ + qm_idx = torch.argmax(qmask, 1) + q0_sel = self._select_parties(q0, qm_idx) + r0_sel = self._select_parties(r0, qm_idx) + + ## global state ## + g_ = self.g_cell(torch.cat([U, q0_sel, r0_sel], dim=1), + torch.zeros(U.size()[0],self.D_g).type(U.type()) if g_hist.size()[0]==0 else + g_hist[-1]) + # g_ = self.dropout(g_) + + ## context ## + if g_hist.size()[0]==0: + c_ = torch.zeros(U.size()[0], self.D_g).type(U.type()) + alpha = None + else: + c_, alpha = self.attention(g_hist, U) + + ## external state ## + U_r_c_ = torch.cat([U, x2, c_], dim=1).unsqueeze(1).expand(-1, qmask.size()[1],-1) + # print ('urc', U_r_c_.size()) + # print ('u x2, c', U.size(), x2.size(), c_.size()) + rs_ = self.r_cell(U_r_c_.contiguous().view(-1, self.D_m+self.D_s+self.D_g), + r0.view(-1, self.D_r)).view(U.size()[0], -1, self.D_r) + # rs_ = self.dropout(rs_) + + ## internal state ## + es_c_ = torch.cat([x1, c_], dim=1).unsqueeze(1).expand(-1,qmask.size()[1],-1) + qs_ = self.p_cell(es_c_.contiguous().view(-1, self.D_s+self.D_g), + q0.view(-1, 
self.D_p)).view(U.size()[0], -1, self.D_p) + # qs_ = self.dropout(qs_) + + + if self.listener_state: + ## listener external state ## + U_ = U.unsqueeze(1).expand(-1,qmask.size()[1],-1).contiguous().view(-1,self.D_m) + er_ = o2.unsqueeze(1).expand(-1, qmask.size()[1], -1).contiguous().view(-1, self.D_s) + ss_ = self._select_parties(rs_, qm_idx).unsqueeze(1).\ + expand(-1, qmask.size()[1], -1).contiguous().view(-1, self.D_r) + U_er_ss_ = torch.cat([U_, er_, ss_], 1) + rl_ = self.rl_cell(U_er_ss_, r0.view(-1, self.D_r)).view(U.size()[0], -1, self.D_r) + # rl_ = self.dropout(rl_) + + ## listener internal state ## + es_ = o1.unsqueeze(1).expand(-1, qmask.size()[1], -1).contiguous().view(-1, self.D_s) + ss_ = self._select_parties(qs_, qm_idx).unsqueeze(1).\ + expand(-1, qmask.size()[1], -1).contiguous().view(-1, self.D_p) + es_ss_ = torch.cat([es_, ss_], 1) + ql_ = self.pl_cell(es_ss_, q0.view(-1, self.D_p)).view(U.size()[0], -1, self.D_p) + # ql_ = self.dropout(ql_) + + else: + rl_ = r0 + ql_ = q0 + + qmask_ = qmask.unsqueeze(2) + q_ = ql_*(1-qmask_) + qs_*qmask_ + r_ = rl_*(1-qmask_) + rs_*qmask_ + + ## intent ## + i_q_ = torch.cat([x3, self._select_parties(q_, qm_idx)], dim=1).unsqueeze(1).expand(-1, qmask.size()[1], -1) + is_ = self.i_cell(i_q_.contiguous().view(-1, self.D_s+self.D_p), + i0.view(-1, self.D_i)).view(U.size()[0], -1, self.D_i) + # is_ = self.dropout(is_) + il_ = i0 + i_ = il_*(1-qmask_) + is_*qmask_ + + ## emotion ## + es_ = torch.cat([U, self._select_parties(q_, qm_idx), self._select_parties(r_, qm_idx), + self._select_parties(i_, qm_idx)], dim=1) + e0 = torch.zeros(qmask.size()[0], self.D_e).type(U.type()) if e0.size()[0]==0\ + else e0 + + if self.emo_gru: + e_ = self.e_cell(es_, e0) + else: + e_ = es_ + + # e_ = self.dropout(e_) + g_ = self.dropout1(g_) + q_ = self.dropout2(q_) + r_ = self.dropout3(r_) + i_ = self.dropout4(i_) + e_ = self.dropout5(e_) + + return g_, q_, r_, i_, e_, alpha + + +class CommonsenseRNN(nn.Module): + + def __init__(self, D_m, D_s, D_g, D_p, D_r, D_i, D_e, listener_state=False, + context_attention='simple', D_a=100, dropout=0.5, emo_gru=True): + super(CommonsenseRNN, self).__init__() + + self.D_m = D_m + self.D_g = D_g + self.D_p = D_p + self.D_r = D_r + self.D_i = D_i + self.D_e = D_e + self.dropout = nn.Dropout(dropout) + + self.dialogue_cell = CommonsenseRNNCell(D_m, D_s, D_g, D_p, D_r, D_i, D_e, + listener_state, context_attention, D_a, dropout, emo_gru) + + def forward(self, U, x1, x2, x3, o1, o2, qmask): + """ + U -> seq_len, batch, D_m + x1, x2, x3, o1, o2 -> seq_len, batch, D_s + qmask -> seq_len, batch, party + """ + + g_hist = torch.zeros(0).type(U.type()) # 0-dimensional tensor + q_ = torch.zeros(qmask.size()[1], qmask.size()[2], self.D_p).type(U.type()) # batch, party, D_p + r_ = torch.zeros(qmask.size()[1], qmask.size()[2], self.D_r).type(U.type()) # batch, party, D_r + i_ = torch.zeros(qmask.size()[1], qmask.size()[2], self.D_i).type(U.type()) # batch, party, D_i + + e_ = torch.zeros(0).type(U.type()) # batch, D_e + e = e_ + + alpha = [] + for u_, x1_, x2_, x3_, o1_, o2_, qmask_ in zip(U, x1, x2, x3, o1, o2, qmask): + g_, q_, r_, i_, e_, alpha_ = self.dialogue_cell(u_, x1_, x2_, x3_, o1_, o2_, + qmask_, g_hist, q_, r_, i_, e_) + + g_hist = torch.cat([g_hist, g_.unsqueeze(0)],0) + e = torch.cat([e, e_.unsqueeze(0)],0) + + if type(alpha_)!=type(None): + alpha.append(alpha_[:,0,:]) + + return e, alpha # seq_len, batch, D_e + + +class CommonsenseGRUModel(nn.Module): + + def __init__(self, D_m, D_s, D_g, D_p, D_r, D_i, D_e, D_h, D_a=100, 
n_classes=7, listener_state=False, + context_attention='simple', dropout_rec=0.5, dropout=0.1, emo_gru=True, mode1=0, norm=0, residual=False): + + super(CommonsenseGRUModel, self).__init__() + + if mode1 == 0: + D_x = 4 * D_m + elif mode1 == 1: + D_x = 2 * D_m + else: + D_x = D_m + + self.mode1 = mode1 + self.norm_strategy = norm + self.linear_in = nn.Linear(D_x, D_h) + self.residual = residual + + self.r_weights = nn.Parameter(torch.tensor([0.25, 0.25, 0.25, 0.25])) + + norm_train = True + self.norm1a = nn.LayerNorm(D_m, elementwise_affine=norm_train) + self.norm1b = nn.LayerNorm(D_m, elementwise_affine=norm_train) + self.norm1c = nn.LayerNorm(D_m, elementwise_affine=norm_train) + self.norm1d = nn.LayerNorm(D_m, elementwise_affine=norm_train) + + self.norm3a = nn.BatchNorm1d(D_m, affine=norm_train) + self.norm3b = nn.BatchNorm1d(D_m, affine=norm_train) + self.norm3c = nn.BatchNorm1d(D_m, affine=norm_train) + self.norm3d = nn.BatchNorm1d(D_m, affine=norm_train) + + self.dropout = nn.Dropout(dropout) + self.dropout_rec = nn.Dropout(dropout_rec) + self.cs_rnn_f = CommonsenseRNN(D_h, D_s, D_g, D_p, D_r, D_i, D_e, listener_state, + context_attention, D_a, dropout_rec, emo_gru) + self.cs_rnn_r = CommonsenseRNN(D_h, D_s, D_g, D_p, D_r, D_i, D_e, listener_state, + context_attention, D_a, dropout_rec, emo_gru) + self.sense_gru = nn.GRU(input_size=D_s, hidden_size=D_s//2, num_layers=1, bidirectional=True) + self.matchatt = MatchingAttention(2*D_e,2*D_e,att_type='general2') + self.linear = nn.Linear(2*D_e, D_h) + self.smax_fc = nn.Linear(D_h, n_classes) + + def _reverse_seq(self, X, mask): + """ + X -> seq_len, batch, dim + mask -> batch, seq_len + """ + X_ = X.transpose(0,1) + mask_sum = torch.sum(mask, 1).int() + + xfs = [] + for x, c in zip(X_, mask_sum): + xf = torch.flip(x[:c], [0]) + xfs.append(xf) + return pad_sequence(xfs) + + def forward(self, r1, r2, r3, r4, x1, x2, x3, o1, o2, qmask, umask, att2=False, return_hidden=False): + """ + U -> seq_len, batch, D_m + qmask -> seq_len, batch, party + """ + + seq_len, batch, feature_dim = r1.size() + + if self.norm_strategy == 1: + r1 = self.norm1a(r1.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + r2 = self.norm1b(r2.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + r3 = self.norm1c(r3.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + r4 = self.norm1d(r4.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + + elif self.norm_strategy == 2: + norm2 = nn.LayerNorm((seq_len, feature_dim), elementwise_affine=False) + r1 = norm2(r1.transpose(0, 1)).transpose(0, 1) + r2 = norm2(r2.transpose(0, 1)).transpose(0, 1) + r3 = norm2(r3.transpose(0, 1)).transpose(0, 1) + r4 = norm2(r4.transpose(0, 1)).transpose(0, 1) + + elif self.norm_strategy == 3: + r1 = self.norm3a(r1.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + r2 = self.norm3b(r2.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + r3 = self.norm3c(r3.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + r4 = self.norm3d(r4.transpose(0, 1).reshape(-1, feature_dim)).reshape(-1, seq_len, feature_dim).transpose(1, 0) + + if self.mode1 == 0: + r = torch.cat([r1, r2, r3, r4], axis=-1) + elif self.mode1 == 1: + r = torch.cat([r1, r2], axis=-1) + elif self.mode1 == 2: + r = (r1 + r2 + r3 + r4)/4 + elif self.mode1 == 
3: + r = r1 + elif self.mode1 == 4: + r = r2 + elif self.mode1 == 5: + r = r3 + elif self.mode1 == 6: + r = r4 + elif self.mode1 == 7: + r = self.r_weights[0]*r1 + self.r_weights[1]*r2 + self.r_weights[2]*r3 + self.r_weights[3]*r4 + + r = self.linear_in(r) + + emotions_f, alpha_f = self.cs_rnn_f(r, x1, x2, x3, o1, o2, qmask) + + out_sense, _ = self.sense_gru(x1) + + rev_r = self._reverse_seq(r, umask) + rev_x1 = self._reverse_seq(x1, umask) + rev_x2 = self._reverse_seq(x2, umask) + rev_x3 = self._reverse_seq(x3, umask) + rev_o1 = self._reverse_seq(o1, umask) + rev_o2 = self._reverse_seq(o2, umask) + rev_qmask = self._reverse_seq(qmask, umask) + emotions_b, alpha_b = self.cs_rnn_r(rev_r, rev_x1, rev_x2, rev_x3, rev_o1, rev_o2, rev_qmask) + emotions_b = self._reverse_seq(emotions_b, umask) + + emotions = torch.cat([emotions_f,emotions_b],dim=-1) + emotions = self.dropout_rec(emotions) + + alpha, alpha_f, alpha_b = [], [], [] + if att2: + att_emotions = [] + alpha = [] + for t in emotions: + att_em, alpha_ = self.matchatt(emotions,t,mask=umask) + att_emotions.append(att_em.unsqueeze(0)) + alpha.append(alpha_[:,0,:]) + att_emotions = torch.cat(att_emotions,dim=0) + hidden = F.relu(self.linear(att_emotions)) + else: + hidden = F.relu(self.linear(emotions)) + + hidden = self.dropout(hidden) + + if self.residual: + hidden = hidden + r + + log_prob = F.log_softmax(self.smax_fc(hidden), 2) + + if return_hidden: + return hidden, alpha, alpha_f, alpha_b, emotions + return log_prob, out_sense, alpha, alpha_f, alpha_b, emotions + + \ No newline at end of file diff --git a/Model/COSMIC/erc_training/dataloader.py b/Model/COSMIC/erc_training/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..4eace9877eab07dd94987f2e10693591628872bc --- /dev/null +++ b/Model/COSMIC/erc_training/dataloader.py @@ -0,0 +1,276 @@ +import torch +from torch.utils.data import Dataset +from torch.nn.utils.rnn import pad_sequence +import pickle, pandas as pd + +class IEMOCAPRobertaCometDataset(Dataset): + + def __init__(self, split): + ''' + label index mapping = {'hap':0, 'sad':1, 'neu':2, 'ang':3, 'exc':4, 'fru':5} + ''' + self.speakers, self.labels, \ + self.roberta1, self.roberta2, self.roberta3, self.roberta4,\ + self.sentences, self.trainIds, self.testIds, self.validIds \ + = pickle.load(open('iemocap/iemocap_features_roberta.pkl', 'rb'), encoding='latin1') + + self.xIntent, self.xAttr, self.xNeed, self.xWant, self.xEffect, self.xReact, self.oWant, self.oEffect, self.oReact \ + = pickle.load(open('iemocap/iemocap_features_comet.pkl', 'rb'), encoding='latin1') + + if split == 'train': + self.keys = [x for x in self.trainIds] + elif split == 'test': + self.keys = [x for x in self.testIds] + elif split == 'valid': + self.keys = [x for x in self.validIds] + + self.len = len(self.keys) + + def __getitem__(self, index): + vid = self.keys[index] + return torch.FloatTensor(self.roberta1[vid]),\ + torch.FloatTensor(self.roberta2[vid]),\ + torch.FloatTensor(self.roberta3[vid]),\ + torch.FloatTensor(self.roberta4[vid]),\ + torch.FloatTensor(self.xIntent[vid]),\ + torch.FloatTensor(self.xAttr[vid]),\ + torch.FloatTensor(self.xNeed[vid]),\ + torch.FloatTensor(self.xWant[vid]),\ + torch.FloatTensor(self.xEffect[vid]),\ + torch.FloatTensor(self.xReact[vid]),\ + torch.FloatTensor(self.oWant[vid]),\ + torch.FloatTensor(self.oEffect[vid]),\ + torch.FloatTensor(self.oReact[vid]),\ + torch.FloatTensor([[1,0] if x=='M' else [0,1] for x in self.speakers[vid]]),\ + torch.FloatTensor([1]*len(self.labels[vid])),\ + 
torch.LongTensor(self.labels[vid]),\ + vid + + def __len__(self): + return self.len + + def collate_fn(self, data): + dat = pd.DataFrame(data) + return [pad_sequence(dat[i]) if i<14 else pad_sequence(dat[i], True) if i<16 else dat[i].tolist() for i in dat] + + +class MELDRobertaCometDataset(Dataset): + + def __init__(self, split, classify='emotion'): + ''' + label index mapping = + ''' + self.speakers, self.emotion_labels, self.sentiment_labels, \ + self.roberta1, self.roberta2, self.roberta3, self.roberta4, \ + self.sentences, self.trainIds, self.testIds, self.validIds \ + = pickle.load(open('meld/meld_features_roberta.pkl', 'rb'), encoding='latin1') + + self.xIntent, self.xAttr, self.xNeed, self.xWant, self.xEffect, self.xReact, self.oWant, self.oEffect, self.oReact \ + = pickle.load(open('meld/meld_features_comet.pkl', 'rb'), encoding='latin1') + + if split == 'train': + self.keys = [x for x in self.trainIds] + elif split == 'test': + self.keys = [x for x in self.testIds] + elif split == 'valid': + self.keys = [x for x in self.validIds] + + if classify == 'emotion': + self.labels = self.emotion_labels + else: + self.labels = self.sentiment_labels + + self.len = len(self.keys) + + def __getitem__(self, index): + vid = self.keys[index] + return torch.FloatTensor(self.roberta1[vid]),\ + torch.FloatTensor(self.roberta2[vid]),\ + torch.FloatTensor(self.roberta3[vid]),\ + torch.FloatTensor(self.roberta4[vid]),\ + torch.FloatTensor(self.xIntent[vid]),\ + torch.FloatTensor(self.xAttr[vid]),\ + torch.FloatTensor(self.xNeed[vid]),\ + torch.FloatTensor(self.xWant[vid]),\ + torch.FloatTensor(self.xEffect[vid]),\ + torch.FloatTensor(self.xReact[vid]),\ + torch.FloatTensor(self.oWant[vid]),\ + torch.FloatTensor(self.oEffect[vid]),\ + torch.FloatTensor(self.oReact[vid]),\ + torch.FloatTensor(self.speakers[vid]),\ + torch.FloatTensor([1]*len(self.labels[vid])),\ + torch.LongTensor(self.labels[vid]),\ + vid + + def __len__(self): + return self.len + + def collate_fn(self, data): + dat = pd.DataFrame(data) + return [pad_sequence(dat[i]) if i<14 else pad_sequence(dat[i], True) if i<16 else dat[i].tolist() for i in dat] + +class RobertaCometDataset(Dataset): + + def __init__(self, split, path_roberta="epik/epik_features_roberta.pkl", path_comet="epik/epik_features_comet.pkl"): + self.speakers, self.labels, \ + self.roberta1, self.roberta2, self.roberta3, self.roberta4, \ + self.sentences, self.trainIds, self.testIds, self.validIds \ + = pickle.load(open(path_roberta, 'rb'), encoding='latin1') + + self.xIntent, self.xAttr, self.xNeed, self.xWant, self.xEffect, self.xReact, self.oWant, self.oEffect, self.oReact \ + = pickle.load(open(path_comet, 'rb'), encoding='latin1') + + if split == 'train': + self.keys = [x for x in self.trainIds] + elif split == 'test': + self.keys = [x for x in self.testIds] + elif split == 'valid': + self.keys = [x for x in self.validIds] + + self.len = len(self.keys) + + def __getitem__(self, index): + vid = self.keys[index] + return torch.FloatTensor(self.roberta1[vid]),\ + torch.FloatTensor(self.roberta2[vid]),\ + torch.FloatTensor(self.roberta3[vid]),\ + torch.FloatTensor(self.roberta4[vid]),\ + torch.FloatTensor(self.xIntent[vid]),\ + torch.FloatTensor(self.xAttr[vid]),\ + torch.FloatTensor(self.xNeed[vid]),\ + torch.FloatTensor(self.xWant[vid]),\ + torch.FloatTensor(self.xEffect[vid]),\ + torch.FloatTensor(self.xReact[vid]),\ + torch.FloatTensor(self.oWant[vid]),\ + torch.FloatTensor(self.oEffect[vid]),\ + torch.FloatTensor(self.oReact[vid]),\ + torch.FloatTensor([[1,0] if 
x=='0' else [0,1] for x in self.speakers[vid]]),\ + torch.FloatTensor([1]*len(self.labels[vid])),\ + torch.LongTensor(self.labels[vid]),\ + vid + + def __len__(self): + return self.len + + def collate_fn(self, data): + dat = pd.DataFrame(data) + return [pad_sequence(dat[i]) if i<14 else pad_sequence(dat[i], True) if i<16 else dat[i].tolist() for i in dat] + + +class DailyDialogueRobertaCometDataset(Dataset): + + def __init__(self, split): + + self.speakers, self.labels, \ + self.roberta1, self.roberta2, self.roberta3, self.roberta4, \ + self.sentences, self.trainIds, self.testIds, self.validIds \ + = pickle.load(open('dailydialog/dailydialog_features_roberta.pkl', 'rb'), encoding='latin1') + + self.xIntent, self.xAttr, self.xNeed, self.xWant, self.xEffect, self.xReact, self.oWant, self.oEffect, self.oReact \ + = pickle.load(open('dailydialog/dailydialog_features_comet.pkl', 'rb'), encoding='latin1') + + if split == 'train': + self.keys = [x for x in self.trainIds] + elif split == 'test': + self.keys = [x for x in self.testIds] + elif split == 'valid': + self.keys = [x for x in self.validIds] + + self.len = len(self.keys) + + def __getitem__(self, index): + vid = self.keys[index] + return torch.FloatTensor(self.roberta1[vid]),\ + torch.FloatTensor(self.roberta2[vid]),\ + torch.FloatTensor(self.roberta3[vid]),\ + torch.FloatTensor(self.roberta4[vid]),\ + torch.FloatTensor(self.xIntent[vid]),\ + torch.FloatTensor(self.xAttr[vid]),\ + torch.FloatTensor(self.xNeed[vid]),\ + torch.FloatTensor(self.xWant[vid]),\ + torch.FloatTensor(self.xEffect[vid]),\ + torch.FloatTensor(self.xReact[vid]),\ + torch.FloatTensor(self.oWant[vid]),\ + torch.FloatTensor(self.oEffect[vid]),\ + torch.FloatTensor(self.oReact[vid]),\ + torch.FloatTensor([[1,0] if x=='0' else [0,1] for x in self.speakers[vid]]),\ + torch.FloatTensor([1]*len(self.labels[vid])),\ + torch.LongTensor(self.labels[vid]),\ + vid + + def __len__(self): + return self.len + + def collate_fn(self, data): + dat = pd.DataFrame(data) + return [pad_sequence(dat[i]) if i<14 else pad_sequence(dat[i], True) if i<16 else dat[i].tolist() for i in dat] + +class EmoryNLPRobertaCometDataset(Dataset): + + def __init__(self, split, classify='emotion'): + + ''' + label index mapping = {'Joyful': 0, 'Mad': 1, 'Peaceful': 2, 'Neutral': 3, 'Sad': 4, 'Powerful': 5, 'Scared': 6} + ''' + + self.speakers, self.emotion_labels, \ + self.roberta1, self.roberta2, self.roberta3, self.roberta4, \ + self.sentences, self.trainId, self.testId, self.validId \ + = pickle.load(open('emorynlp/emorynlp_features_roberta.pkl', 'rb'), encoding='latin1') + + sentiment_labels = {} + for item in self.emotion_labels: + array = [] + # 0 negative, 1 neutral, 2 positive + for e in self.emotion_labels[item]: + if e in [1, 4, 6]: + array.append(0) + elif e == 3: + array.append(1) + elif e in [0, 2, 5]: + array.append(2) + sentiment_labels[item] = array + + self.xIntent, self.xAttr, self.xNeed, self.xWant, self.xEffect, self.xReact, self.oWant, self.oEffect, self.oReact \ + = pickle.load(open('emorynlp/emorynlp_features_comet.pkl', 'rb'), encoding='latin1') + + if split == 'train': + self.keys = [x for x in self.trainId] + elif split == 'test': + self.keys = [x for x in self.testId] + elif split == 'valid': + self.keys = [x for x in self.validId] + + if classify == 'emotion': + self.labels = self.emotion_labels + elif classify == 'sentiment': + self.labels = sentiment_labels + + self.len = len(self.keys) + + def __getitem__(self, index): + vid = self.keys[index] + return 
torch.FloatTensor(self.roberta1[vid]),\ + torch.FloatTensor(self.roberta2[vid]),\ + torch.FloatTensor(self.roberta3[vid]),\ + torch.FloatTensor(self.roberta4[vid]),\ + torch.FloatTensor(self.xIntent[vid]),\ + torch.FloatTensor(self.xAttr[vid]),\ + torch.FloatTensor(self.xNeed[vid]),\ + torch.FloatTensor(self.xWant[vid]),\ + torch.FloatTensor(self.xEffect[vid]),\ + torch.FloatTensor(self.xReact[vid]),\ + torch.FloatTensor(self.oWant[vid]),\ + torch.FloatTensor(self.oEffect[vid]),\ + torch.FloatTensor(self.oReact[vid]),\ + torch.FloatTensor([[1,0] if x=='0' else [0,1] for x in self.speakers[vid]]),\ + torch.FloatTensor([1]*len(self.labels[vid])),\ + torch.LongTensor(self.labels[vid]),\ + vid + + def __len__(self): + return self.len + + def collate_fn(self, data): + dat = pd.DataFrame(data) + return [pad_sequence(dat[i]) if i<14 else pad_sequence(dat[i], True) if i<16 else dat[i].tolist() for i in dat] diff --git a/Model/COSMIC/erc_training/model.py b/Model/COSMIC/erc_training/model.py new file mode 100644 index 0000000000000000000000000000000000000000..43f9c28d481b7e36b863089cf44ed7539301ff24 --- /dev/null +++ b/Model/COSMIC/erc_training/model.py @@ -0,0 +1,229 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.nn.utils.rnn import pad_sequence + +class MaskedNLLLoss(nn.Module): + + def __init__(self, weight=None): + super(MaskedNLLLoss, self).__init__() + self.weight = weight + self.loss = nn.NLLLoss(weight=weight, + reduction='sum') + + def forward(self, pred, target, mask): + """ + pred -> batch*seq_len, n_classes + target -> batch*seq_len + mask -> batch, seq_len + """ + mask_ = mask.view(-1,1) # batch*seq_len, 1 + if type(self.weight)==type(None): + loss = self.loss(pred*mask_, target)/torch.sum(mask) + else: + loss = self.loss(pred*mask_, target)\ + /torch.sum(self.weight[target]*mask_.squeeze()) + return loss + + +class MaskedMSELoss(nn.Module): + + def __init__(self): + super(MaskedMSELoss, self).__init__() + self.loss = nn.MSELoss(reduction='sum') + + def forward(self, pred, target, mask): + """ + pred -> batch*seq_len + target -> batch*seq_len + mask -> batch*seq_len + """ + loss = self.loss(pred*mask, target)/torch.sum(mask) + return loss + + +class UnMaskedWeightedNLLLoss(nn.Module): + + def __init__(self, weight=None): + super(UnMaskedWeightedNLLLoss, self).__init__() + self.weight = weight + self.loss = nn.NLLLoss(weight=weight, + reduction='sum') + + def forward(self, pred, target): + """ + pred -> batch*seq_len, n_classes + target -> batch*seq_len + """ + if type(self.weight)==type(None): + loss = self.loss(pred, target) + else: + loss = self.loss(pred, target)\ + /torch.sum(self.weight[target]) + return loss + + +class SimpleAttention(nn.Module): + + def __init__(self, input_dim): + super(SimpleAttention, self).__init__() + self.input_dim = input_dim + self.scalar = nn.Linear(self.input_dim,1,bias=False) + + def forward(self, M, x=None): + """ + M -> (seq_len, batch, vector) + x -> dummy argument for the compatibility with MatchingAttention + """ + scale = self.scalar(M) # seq_len, batch, 1 + alpha = F.softmax(scale, dim=0).permute(1,2,0) # batch, 1, seq_len + attn_pool = torch.bmm(alpha, M.transpose(0,1))[:,0,:] # batch, vector + return attn_pool, alpha + + +class MatchingAttention(nn.Module): + + def __init__(self, mem_dim, cand_dim, alpha_dim=None, att_type='general'): + super(MatchingAttention, self).__init__() + assert att_type!='concat' or alpha_dim!=None + assert att_type!='dot' or 
mem_dim==cand_dim + self.mem_dim = mem_dim + self.cand_dim = cand_dim + self.att_type = att_type + if att_type=='general': + self.transform = nn.Linear(cand_dim, mem_dim, bias=False) + if att_type=='general2': + self.transform = nn.Linear(cand_dim, mem_dim, bias=True) + #torch.nn.init.normal_(self.transform.weight,std=0.01) + elif att_type=='concat': + self.transform = nn.Linear(cand_dim+mem_dim, alpha_dim, bias=False) + self.vector_prod = nn.Linear(alpha_dim, 1, bias=False) + + def forward(self, M, x, mask=None): + """ + M -> (seq_len, batch, mem_dim) + x -> (batch, cand_dim) + mask -> (batch, seq_len) + """ + if type(mask)==type(None): + mask = torch.ones(M.size(1), M.size(0)).type(M.type()) + + if self.att_type=='dot': + # vector = cand_dim = mem_dim + M_ = M.permute(1,2,0) # batch, vector, seqlen + x_ = x.unsqueeze(1) # batch, 1, vector + alpha = F.softmax(torch.bmm(x_, M_), dim=2) # batch, 1, seqlen + elif self.att_type=='general': + M_ = M.permute(1,2,0) # batch, mem_dim, seqlen + x_ = self.transform(x).unsqueeze(1) # batch, 1, mem_dim + alpha = F.softmax(torch.bmm(x_, M_), dim=2) # batch, 1, seqlen + elif self.att_type=='general2': + M_ = M.permute(1,2,0) # batch, mem_dim, seqlen + x_ = self.transform(x).unsqueeze(1) # batch, 1, mem_dim + mask_ = mask.unsqueeze(2).repeat(1, 1, self.mem_dim).transpose(1, 2) # batch, seq_len, mem_dim + M_ = M_ * mask_ + alpha_ = torch.bmm(x_, M_)*mask.unsqueeze(1) + alpha_ = torch.tanh(alpha_) + alpha_ = F.softmax(alpha_, dim=2) + # alpha_ = F.softmax((torch.bmm(x_, M_))*mask.unsqueeze(1), dim=2) # batch, 1, seqlen + alpha_masked = alpha_*mask.unsqueeze(1) # batch, 1, seqlen + alpha_sum = torch.sum(alpha_masked, dim=2, keepdim=True) # batch, 1, 1 + alpha = alpha_masked/alpha_sum # batch, 1, 1 ; normalized + #import ipdb;ipdb.set_trace() + else: + M_ = M.transpose(0,1) # batch, seqlen, mem_dim + x_ = x.unsqueeze(1).expand(-1,M.size()[0],-1) # batch, seqlen, cand_dim + M_x_ = torch.cat([M_,x_],2) # batch, seqlen, mem_dim+cand_dim + mx_a = F.tanh(self.transform(M_x_)) # batch, seqlen, alpha_dim + alpha = F.softmax(self.vector_prod(mx_a),1).transpose(1,2) # batch, 1, seqlen + + attn_pool = torch.bmm(alpha, M.transpose(0,1))[:,0,:] # batch, mem_dim + return attn_pool, alpha + + +class Attention(nn.Module): + def __init__(self, embed_dim, hidden_dim=None, out_dim=None, n_head=1, score_function='dot_product', dropout=0): + ''' Attention Mechanism + :param embed_dim: + :param hidden_dim: + :param out_dim: + :param n_head: num of head (Multi-Head Attention) + :param score_function: scaled_dot_product / mlp (concat) / bi_linear (general dot) + :return (?, q_len, out_dim,) + ''' + super(Attention, self).__init__() + if hidden_dim is None: + hidden_dim = embed_dim // n_head + if out_dim is None: + out_dim = embed_dim + self.embed_dim = embed_dim + self.hidden_dim = hidden_dim + self.n_head = n_head + self.score_function = score_function + self.w_k = nn.Linear(embed_dim, n_head * hidden_dim) + self.w_q = nn.Linear(embed_dim, n_head * hidden_dim) + self.proj = nn.Linear(n_head * hidden_dim, out_dim) + self.dropout = nn.Dropout(dropout) + if score_function == 'mlp': + self.weight = nn.Parameter(torch.Tensor(hidden_dim*2)) + elif self.score_function == 'bi_linear': + self.weight = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim)) + else: # dot_product / scaled_dot_product + self.register_parameter('weight', None) + self.reset_parameters() + + def reset_parameters(self): + stdv = 1. 
/ math.sqrt(self.hidden_dim) + if self.weight is not None: + self.weight.data.uniform_(-stdv, stdv) + + def forward(self, k, q): + if len(q.shape) == 2: # q_len missing + q = torch.unsqueeze(q, dim=1) + if len(k.shape) == 2: # k_len missing + k = torch.unsqueeze(k, dim=1) + mb_size = k.shape[0] # ? + k_len = k.shape[1] + q_len = q.shape[1] + # k: (?, k_len, embed_dim,) + # q: (?, q_len, embed_dim,) + # kx: (n_head*?, k_len, hidden_dim) + # qx: (n_head*?, q_len, hidden_dim) + # score: (n_head*?, q_len, k_len,) + # output: (?, q_len, out_dim,) + kx = self.w_k(k).view(mb_size, k_len, self.n_head, self.hidden_dim) + kx = kx.permute(2, 0, 1, 3).contiguous().view(-1, k_len, self.hidden_dim) + qx = self.w_q(q).view(mb_size, q_len, self.n_head, self.hidden_dim) + qx = qx.permute(2, 0, 1, 3).contiguous().view(-1, q_len, self.hidden_dim) + if self.score_function == 'dot_product': + kt = kx.permute(0, 2, 1) + score = torch.bmm(qx, kt) + elif self.score_function == 'scaled_dot_product': + kt = kx.permute(0, 2, 1) + qkt = torch.bmm(qx, kt) + score = torch.div(qkt, math.sqrt(self.hidden_dim)) + elif self.score_function == 'mlp': + kxx = torch.unsqueeze(kx, dim=1).expand(-1, q_len, -1, -1) + qxx = torch.unsqueeze(qx, dim=2).expand(-1, -1, k_len, -1) + kq = torch.cat((kxx, qxx), dim=-1) # (n_head*?, q_len, k_len, hidden_dim*2) + # kq = torch.unsqueeze(kx, dim=1) + torch.unsqueeze(qx, dim=2) + score = torch.tanh(torch.matmul(kq, self.weight)) + elif self.score_function == 'bi_linear': + qw = torch.matmul(qx, self.weight) + kt = kx.permute(0, 2, 1) + score = torch.bmm(qw, kt) + else: + raise RuntimeError('invalid score_function') + #score = F.softmax(score, dim=-1) + score = F.softmax(score, dim=0) + # print (score) + # print (sum(score)) + output = torch.bmm(score, kx) # (n_head*?, q_len, hidden_dim) + output = torch.cat(torch.split(output, mb_size, dim=0), dim=-1) # (?, q_len, n_head*hidden_dim) + output = self.proj(output) # (?, q_len, out_dim) + output = self.dropout(output) + return output, score + + \ No newline at end of file diff --git a/Model/COSMIC/erc_training/predict_epik.py b/Model/COSMIC/erc_training/predict_epik.py new file mode 100644 index 0000000000000000000000000000000000000000..c4566941cdacdb3517930f0b4b3d1c5a086f7cfa --- /dev/null +++ b/Model/COSMIC/erc_training/predict_epik.py @@ -0,0 +1,198 @@ +import torch, argparse +from commonsense_model import CommonsenseGRUModel +from dataloader import RobertaCometDataset +from torch.utils.data import DataLoader + + +def load_model(model_path, args): + emo_gru = True + n_classes = 15 + cuda = args.cuda + + D_m = 1024 + D_s = 768 + D_g = 150 + D_p = 150 + D_r = 150 + D_i = 150 + D_h = 100 + D_a = 100 + D_e = D_p + D_r + D_i + + model = CommonsenseGRUModel( + D_m, + D_s, + D_g, + D_p, + D_r, + D_i, + D_e, + D_h, + D_a, + n_classes=n_classes, + listener_state=args.active_listener, + context_attention=args.attention, + dropout_rec=args.rec_dropout, + dropout=args.dropout, + emo_gru=emo_gru, + mode1=args.mode1, + norm=args.norm, + residual=args.residual, + ) + + if cuda: + model.cuda() + + model.load_state_dict(torch.load(model_path)) + model.eval() + + return model + + +def get_valid_dataloader( + roberta_features_path: str, + comet_features_path: str, + batch_size=1, + num_workers=0, + pin_memory=False, +): + valid_set = RobertaCometDataset("valid", roberta_features_path, comet_features_path) + + test_loader = DataLoader( + valid_set, + batch_size=batch_size, + collate_fn=valid_set.collate_fn, + num_workers=num_workers, + pin_memory=pin_memory, + ) 
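+    # valid_set.keys holds the conversation IDs in loader order (batches are not shuffled), so callers can map batch predictions back to their dialogues.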
+ + return test_loader, valid_set.keys + + +def predict(model, data_loader, args): + predictions = [] + for data in data_loader: + r1, r2, r3, r4, x1, x2, x3, x4, x5, x6, o1, o2, o3, qmask, umask, label = ( + [d.cuda() for d in data[:-1]] if args.cuda else data[:-1] + ) + log_prob, _, alpha, alpha_f, alpha_b, _ = model( + r1, r2, r3, r4, x5, x6, x1, o2, o3, qmask, umask + ) + + lp_ = log_prob.transpose(0, 1).contiguous().view(-1, log_prob.size()[2]) + preds = torch.argmax(lp_, dim=-1) + predictions.append(preds.data.cpu().numpy()) + + return predictions + + +def parse_cosmic_args(): + parser = argparse.ArgumentParser() + + # Command-line arguments for the COSMIC model + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="does not use GPU" + ) + parser.add_argument( + "--lr", type=float, default=0.0001, metavar="LR", help="learning rate" + ) + parser.add_argument( + "--l2", + type=float, + default=0.00003, + metavar="L2", + help="L2 regularization weight", + ) + parser.add_argument( + "--rec-dropout", + type=float, + default=0.3, + metavar="rec_dropout", + help="rec_dropout rate", + ) + parser.add_argument( + "--dropout", type=float, default=0.5, metavar="dropout", help="dropout rate" + ) + parser.add_argument( + "--batch-size", type=int, default=1, metavar="BS", help="batch size" + ) + parser.add_argument( + "--epochs", type=int, default=10, metavar="E", help="number of epochs" + ) + parser.add_argument( + "--class-weight", action="store_true", default=True, help="use class weights" + ) + parser.add_argument( + "--active-listener", action="store_true", default=True, help="active listener" + ) + parser.add_argument( + "--attention", default="simple", help="Attention type in context GRU" + ) + parser.add_argument( + "--tensorboard", + action="store_true", + default=False, + help="Enables tensorboard log", + ) + parser.add_argument("--mode1", type=int, default=2, help="Roberta features to use") + parser.add_argument("--seed", type=int, default=500, metavar="seed", help="seed") + parser.add_argument("--norm", type=int, default=0, help="normalization strategy") + parser.add_argument("--mu", type=float, default=0, help="class_weight_mu") + parser.add_argument( + "--residual", action="store_true", default=True, help="use residual connection" + ) + + args = parser.parse_args() + + args.cuda = torch.cuda.is_available() and not args.no_cuda + if args.cuda: + print("Running on GPU") + else: + print("Running on CPU") + + return args + + +if __name__ == "__main__": + + def pred_to_labels(preds): + mapped_predictions = [] + for pred in preds: + # map the prediction for each conversation + mapped_labels = [] + for label in pred: + mapped_labels.append(label_mapping[label]) + + mapped_predictions.append(mapped_labels) + + # return the mapped labels for each conversation + return mapped_predictions + + label_mapping = { + 0: "Curiosity", + 1: "Obscene", + 2: "Informative", + 3: "Openness", + 4: "Acceptance", + 5: "Interest", + 6: "Greeting", + 7: "Disapproval", + 8: "Denial", + 9: "Anxious", + 10: "Uninterested", + 11: "Remorse", + 12: "Confused", + 13: "Accusatory", + 14: "Annoyed", + } + + args = parse_cosmic_args() + + model = load_model("epik/best_model.pt", args) + test_dataloader, ids = get_valid_dataloader( + "epik/epik_features_roberta.pkl", "epik/epik_features_comet.pkl" + ) + predicted_labels = pred_to_labels(predict(model, test_dataloader, args)) + + for id, labels in zip(ids, predicted_labels): + print(f"Conversation ID: {id}") + print(f"Predicted Sentiment Labels: {labels}") + print(len(labels)) diff --git 
a/Model/COSMIC/feature_extraction/comet/__init__.py b/Model/COSMIC/feature_extraction/comet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Model/COSMIC/feature_extraction/comet/csk_feature_extract.py b/Model/COSMIC/feature_extraction/comet/csk_feature_extract.py new file mode 100644 index 0000000000000000000000000000000000000000..a32239cf0cc10b9dd91bc7f6799dcc0ac9e5601a --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/csk_feature_extract.py @@ -0,0 +1,110 @@ +import os +from tqdm import tqdm +from nltk import tokenize +import numpy as np +import pickle, torch +import comet.src.data.data as data +import comet.src.data.config as cfg +import comet.src.models.utils as model_utils +import comet.src.interactive.functions as interactive + + +class CSKFeatureExtractor: + def __init__(self, dir="."): + super(CSKFeatureExtractor, self).__init__() + + device = 0 + model_file = os.path.join( + dir, "comet/pretrained_models/atomic_pretrained_model.pickle" + ) + sampling_algorithm = "beam-5" + category = "all" + + opt, state_dict = interactive.load_model_file(model_file) + data_loader, text_encoder = interactive.load_data("atomic", opt, dir) + + self.opt = opt + self.data_loader = data_loader + self.text_encoder = text_encoder + + n_ctx = data_loader.max_event + data_loader.max_effect + n_vocab = len(text_encoder.encoder) + n_ctx + self.model = interactive.make_model(opt, n_vocab, n_ctx, state_dict) + self.model.eval() + + if device != "cpu": + cfg.device = int(device) + cfg.do_gpu = True + torch.cuda.set_device(cfg.device) + self.model.cuda(cfg.device) + else: + cfg.device = "cpu" + + def set_atomic_inputs(self, input_event, category, data_loader, text_encoder): + XMB = torch.zeros(1, data_loader.max_event + 1).long().to(cfg.device) + prefix, suffix = data.atomic_data.do_example( + text_encoder, input_event, None, True, None + ) + + if len(prefix) > data_loader.max_event + 1: + prefix = prefix[: data_loader.max_event + 1] + + XMB[:, : len(prefix)] = torch.LongTensor(prefix) + XMB[:, -1] = torch.LongTensor([text_encoder.encoder["<{}>".format(category)]]) + + batch = {} + batch["sequences"] = XMB + batch["attention_mask"] = data.atomic_data.make_attention_mask(XMB) + return batch + + def extract(self, sentence): + atomic_keys = [ + "xIntent", + "xAttr", + "xNeed", + "xWant", + "xEffect", + "xReact", + "oWant", + "oEffect", + "oReact", + ] + map1 = [{}, {}, {}, {}, {}, {}, {}, {}, {}] + all_keys = list(sentence.keys()) + + for i in tqdm(range(len(all_keys))): + item = all_keys[i] + list1 = [[], [], [], [], [], [], [], [], []] + + for x in sentence[item]: + input_event = x.encode("ascii", errors="ignore").decode("utf-8") + m1 = [] + for sent in tokenize.sent_tokenize(input_event): + seqs = [] + masks = [] + for category in atomic_keys: + batch = self.set_atomic_inputs( + sent, category, self.data_loader, self.text_encoder + ) + seqs.append(batch["sequences"]) + masks.append(batch["attention_mask"]) + + XMB = torch.cat(seqs) + MMB = torch.cat(masks) + XMB = model_utils.prepare_position_embeddings( + self.opt, self.data_loader.vocab_encoder, XMB.unsqueeze(-1) + ) + h, _ = self.model(XMB.unsqueeze(1), sequence_mask=MMB) + + last_index = MMB[0][:-1].nonzero()[-1].cpu().numpy()[0] + 1 + m1.append(h[:, -1, :].detach().cpu().numpy()) + + m1 = np.mean(np.array(m1), axis=0) + + for k, l1 in enumerate(list1): + l1.append(m1[k]) + + for k, v1 in enumerate(map1): + v1[item] = list1[k] + + return map1 diff --git 
a/Model/COSMIC/feature_extraction/comet/src/__init__.py b/Model/COSMIC/feature_extraction/comet/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Model/COSMIC/feature_extraction/comet/src/data/atomic.py b/Model/COSMIC/feature_extraction/comet/src/data/atomic.py new file mode 100644 index 0000000000000000000000000000000000000000..85020a6809898915afbf57a3417f3daf9a9b22bd --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/data/atomic.py @@ -0,0 +1,337 @@ +import comet.utils.utils as utils +import comet.src.data.utils as data_utils +import comet.src.data.config as cfg + +import pandas +import json +import random +import math +import torch + +from tqdm import tqdm + + +def map_name(name): + if name == "train": + return "trn" + elif name == "test": + return "tst" + else: + return "dev" + + +class DataLoader(object): + def __init__(self, opt): + self.data = {} + self.data["train"] = {} + self.data["dev"] = {} + self.data["test"] = {} + + self.sequences = {} + self.sequences["train"] = {} + self.sequences["dev"] = {} + self.sequences["test"] = {} + + self.masks = {} + self.masks["train"] = {} + self.masks["dev"] = {} + self.masks["test"] = {} + + self.offsets = {} + self.offsets["train"] = {} + self.offsets["dev"] = {} + self.offsets["test"] = {} + + def offset_summary(self, split): + return self.offsets[split]["total"] + + +def do_take_partial_dataset(data_opts): + if data_opts.get("kr", None) is None: + return False + if data_opts.kr == 1: + return False + return True + + +def select_partial_dataset(data_opts, data): + num_selections = math.ceil(data_opts.kr * len(data)) + return random.sample(data, num_selections) + + +class GenerationDataLoader(DataLoader): + def __init__(self, opt, categories): + super(GenerationDataLoader, self).__init__(opt) + + self.categories = categories + self.opt = opt + + for split in self.data: + self.data[split] = {"total": []} + self.offsets[split] = {"total": 0} + + self.vocab_encoder = None + self.vocab_decoder = None + self.special_chars = None + self.max_event = None + self.max_effect = None + + def load_data(self, path): + if ".pickle" in path: + print("Loading data from: {}".format(path)) + data_utils.load_existing_data_loader(self, path) + + return True + + for split in self.data: + file_name = "v4_atomic_{}.csv".format(map_name(split)) + + df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0) + df.iloc[:, :9] = df.iloc[:, :9].apply( + lambda col: col.apply(json.loads)) + + for cat in self.categories: + attr = df[cat] + self.data[split]["total"] += utils.zipped_flatten(zip( + attr.index, ["<{}>".format(cat)] * len(attr), attr.values)) + + if do_take_partial_dataset(self.opt.data): + self.data["train"]["total"] = select_partial_dataset( + self.opt.data, self.data["train"]["total"]) + + return False + + def make_tensors(self, text_encoder, special, + splits=["train", "dev", "test"], test=False): + self.vocab_encoder = text_encoder.encoder + self.vocab_decoder = text_encoder.decoder + self.special_chars = special + + sequences = {} + for split in splits: + sequences[split] = get_generation_sequences( + self.opt, self.data, split, text_encoder, test) + + self.masks[split]["total"] = [(len(i[0]), len(i[1])) for + i in sequences[split]] + + self.max_event = max([max([l[0] for l in self.masks[split]["total"]]) + for split in self.masks]) + self.max_effect = max([max([l[1] for l in self.masks[split]["total"]]) + for split in self.masks]) + + 
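+ # max_event and max_effect are corpus-wide maxima of the event/effect token lengths recorded in the masks above; they fix the padded widths of the two segments in the sequence tensors built below.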
print(self.max_event) + print(self.max_effect) + + for split in splits: + num_elements = len(sequences[split]) + self.sequences[split]["total"] = torch.LongTensor( + num_elements, self.max_event + self.max_effect).fill_(0) + + for i, seq in enumerate(sequences[split]): + # print(self.sequences[split]["total"][i, :len(seq[0])].size()) + # print(torch.FloatTensor(seq[0]).size()) + self.sequences[split]["total"][i, :len(seq[0])] = \ + torch.LongTensor(seq[0]) + self.sequences[split]["total"][i, self.max_event:self.max_event + len(seq[1])] = \ + torch.LongTensor(seq[1]) + + def sample_batch(self, split, bs, idxs=None): + offset = self.offsets[split]["total"] + + batch = {} + + # Decided not to reduce computation on here because it's all parallel + # anyway and we don't want to run out of memory in cases where we + # don't see the longest version quickly enough + + if idxs: + seqs = self.sequences[split]["total"].index_select( + 0, torch.LongTensor(idxs).to( + self.sequences[split]["total"].device)) + else: + seqs = self.sequences[split]["total"][offset:offset + bs] + batch["sequences"] = seqs.to(cfg.device) + batch["attention_mask"] = make_attention_mask(seqs) + batch["loss_mask"] = make_loss_mask( + seqs, self.max_event, 1) + batch["key"] = ("total", offset, offset + bs) + + offset += seqs.size(0) + + self.offsets[split]["total"] = offset + + if split == "train" and offset + bs > len(self.sequences[split]["total"]): + return batch, True + elif offset >= len(self.sequences[split]["total"]): + return batch, True + else: + return batch, False + + def reset_offsets(self, splits=["train", "test", "dev"], + shuffle=True, keys=None): + if isinstance(splits, str): + splits = [splits] + + for split in splits: + if keys is None: + keys = ["total"] + + for key in keys: + self.offsets[split][key] = 0 + + if shuffle: + self.shuffle_sequences(split, keys) + + def shuffle_sequences(self, split="train", keys=None): + if keys is None: + # print(type(self.data)) + # print(type(self.data.keys())) + keys = self.data[split].keys() + + for key in keys: + idxs = list(range(len(self.data[split][key]))) + + random.shuffle(idxs) + + self.sequences[split][key] = \ + self.sequences[split][key].index_select( + 0, torch.LongTensor(idxs)) + + temp = [self.data[split][key][i] for i in idxs] + self.data[split][key] = temp + temp = [self.masks[split][key][i] for i in idxs] + self.masks[split][key] = temp + + +def prune_data_for_evaluation(data_loader, categories, split): + indices = [] + for i, example in enumerate(data_loader.data[split]["total"]): + if example[1] in categories: + indices.append(i) + + data_loader.masks[split]["total"] = [data_loader.masks[split]["total"][i] + for i in indices] + data_loader.sequences[split]["total"] = \ + data_loader.sequences[split]["total"].index_select( + 0, torch.LongTensor(indices)) + data_loader.data[split]["total"] = [data_loader.data[split]["total"][i] + for i in indices] + + +def make_attention_mask(sequences): + return (sequences != 0).float().to(cfg.device) + + +def make_loss_mask(sequences, max_event, num_delim_tokens): + # print(num_delim_tokens) + # print(sequences.size()) + mask = (sequences != 0).float() + mask[:, :max_event + num_delim_tokens] = 0 + return mask[:, 1:].to(cfg.device) + + +def find_underscore_length(seq): + start = "_" + + while start in seq: + start += "_" + return start[:-1] + + +def handle_underscores(suffix, text_encoder, prefix=False): + encoder = text_encoder.encoder + if prefix: + tok = "___" + else: + tok = find_underscore_length(suffix) + + 
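+ # ATOMIC marks blanks with runs of underscores; tok is the longest such run in the phrase and is used as the split delimiter so each blank is encoded as a single <blank> token below.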
suffix_parts = [i.strip() for i in suffix.split("{}".format(tok))] + to_flatten = [] + for i, part in enumerate(suffix_parts): + if part: + to_flatten.append(text_encoder.encode([part], verbose=False)[0]) + + if i != len(suffix_parts) - 1 and suffix_parts[i+1]: + to_flatten.append([encoder["<blank>"]]) + else: + to_flatten.append([encoder["<blank>"]]) + + final_suffix = utils.flatten(to_flatten) + + return final_suffix + + +def get_generation_sequences(opt, data, split, text_encoder, test): + sequences = [] + count = 0 + + final_prefix = None + final_suffix = None + + for prefix, category, suffix in tqdm(data[split]["total"]): + final_prefix, final_suffix = do_example( + text_encoder, prefix, suffix, True, True) + # if do_prefix: + # if "___" in prefix: + # final_prefix = handle_underscores(prefix, text_encoder, True) + # else: + # final_prefix = text_encoder.encode([prefix], verbose=False)[0] + # if do_suffix: + # if "_" in suffix: + # final_suffix = handle_underscores(suffix, text_encoder) + # else: + # final_suffix = text_encoder.encode([suffix], verbose=False)[0] + + final = compile_final_sequence( + opt, final_prefix, final_suffix, category, text_encoder) + + sequences.append(final) + + count += 1 + + if count > 10 and test: + break + + return sequences + + + +def do_example(text_encoder, prefix, suffix, do_prefix, do_suffix): + final_prefix = None + final_suffix = None + + if do_prefix: + if "___" in prefix: + final_prefix = handle_underscores(prefix, text_encoder, True) + else: + final_prefix = text_encoder.encode([prefix], verbose=False)[0] + if do_suffix: + if "_" in suffix: + final_suffix = handle_underscores(suffix, text_encoder) + else: + final_suffix = text_encoder.encode([suffix], verbose=False)[0] + + return final_prefix, final_suffix + + +def compile_final_sequence(opt, final_prefix, final_suffix, category, text_encoder): + final = [] + + final.append(final_prefix) + final.append( + [text_encoder.encoder[category]] + + final_suffix) + + final[-1].append(text_encoder.encoder["<END>"]) + + return final + + +num_delimiter_tokens = { + "category": 1, + "hierarchy": 3, + "hierarchy+label": 4, + "category+hierarchy": 4, + "category+hierarchy+label": 5 +} diff --git a/Model/COSMIC/feature_extraction/comet/src/data/conceptnet.py b/Model/COSMIC/feature_extraction/comet/src/data/conceptnet.py new file mode 100644 index 0000000000000000000000000000000000000000..872df788249667bf721a91d3460891a4047a718f --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/data/conceptnet.py @@ -0,0 +1,342 @@ +import comet.src.data.utils as data_utils +import comet.src.data.atomic as adata +import comet.src.data.config as cfg + +import torch +import random +from tqdm import tqdm + + +def map_name(name, opt): + if name == "train": + return "train{}k.txt".format(opt.trainsize) + elif name == "test": + return "test.txt" + else: + return "dev{}.txt".format(opt.devversion) + + +conceptnet_relations = [ + 'AtLocation', 'CapableOf', 'Causes', 'CausesDesire', + 'CreatedBy', 'DefinedAs', 'DesireOf', 'Desires', 'HasA', + 'HasFirstSubevent', 'HasLastSubevent', 'HasPainCharacter', + 'HasPainIntensity', 'HasPrerequisite', 'HasProperty', + 'HasSubevent', 'InheritsFrom', 'InstanceOf', 'IsA', + 'LocatedNear', 'LocationOfAction', 'MadeOf', 'MotivatedByGoal', + 'NotCapableOf', 'NotDesires', 'NotHasA', 'NotHasProperty', + 'NotIsA', 'NotMadeOf', 'PartOf', 'ReceivesAction', 'RelatedTo', + 'SymbolOf', 'UsedFor' +] + + +split_into_words = { + 'AtLocation': "at location", + 'CapableOf': "capable of", + 'Causes': "causes", 
'CausesDesire': "causes desire", + 'CreatedBy': "created by", + 'DefinedAs': "defined as", + 'DesireOf': "desire of", + 'Desires': "desires", + 'HasA': "has a", + 'HasFirstSubevent': "has first subevent", + 'HasLastSubevent': "has last subevent", + 'HasPainCharacter': "has pain character", + 'HasPainIntensity': "has pain intensity", + 'HasPrerequisite': "has prequisite", + 'HasProperty': "has property", + 'HasSubevent': "has subevent", + 'InheritsFrom': "inherits from", + 'InstanceOf': 'instance of', + 'IsA': "is a", + 'LocatedNear': "located near", + 'LocationOfAction': "location of action", + 'MadeOf': "made of", + 'MotivatedByGoal': "motivated by goal", + 'NotCapableOf': "not capable of", + 'NotDesires': "not desires", + 'NotHasA': "not has a", + 'NotHasProperty': "not has property", + 'NotIsA': "not is a", + 'NotMadeOf': "not made of", + 'PartOf': "part of", + 'ReceivesAction': "receives action", + 'RelatedTo': "related to", + 'SymbolOf': "symbol of", + 'UsedFor': "used for" +} + + +class GenerationDataLoader(adata.DataLoader): + def __init__(self, opt, categories=None): + super(GenerationDataLoader, self).__init__(opt) + self.opt = opt + + for split in self.data: + self.data[split] = {"total": []} + self.offsets[split] = {"total": 0} + + self.vocab_encoder = None + self.vocab_decoder = None + self.special_chars = None + self.max_e1 = None + self.max_e2 = None + self.max_r = None + + def offset_summary(self, split): + return sum(self.offsets[split].values()) + + def load_data(self, path): + if ".pickle" in path: + print("Loading data from: {}".format(path)) + data_utils.load_existing_data_loader(self, path) + return True + + for split in self.data: + file_name = map_name(split, self.opt.data) + + if split != "dev" or self.opt.data.devversion != "12": + string_tuples = open("{}/{}".format( + path, file_name), "r").read().split("\n") + tuples = [x.split("\t") for x in string_tuples if x] + else: + string_tuples = open("{}/{}".format( + path, "dev1.txt"), "r").read().split("\n") + tuples = [x.split("\t") for x in string_tuples if x] + string_tuples = open("{}/{}".format( + path, "dev2.txt"), "r").read().split("\n") + tuples += [x.split("\t") for x in string_tuples if x] + + if split in ["dev", "test"]: + if self.opt.data.rel == "language": + self.data[split]["total"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), int(i[3])) for i in tuples] + self.data[split]["positive"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), int(i[3])) for i in tuples if int(i[3])] + self.data[split]["negative"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), int(i[3])) for i in tuples if not int(i[3])] + elif self.opt.data.rel == "relation": + self.data[split]["total"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), int(i[3])) for i in tuples] + self.data[split]["positive"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), int(i[3])) for i in tuples if int(i[3])] + self.data[split]["negative"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), int(i[3])) for i in tuples if not int(i[3])] + else: + if self.opt.data.rel == "language": + self.data[split]["total"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), i[3]) for i in tuples] + elif self.opt.data.rel == "relation": + self.data[split]["total"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), i[3]) for i in tuples] + + return False + + def 
make_tensors(self, text_encoder, special, + splits=["train", "dev", "test"], test=False): + self.vocab_encoder = text_encoder.encoder + self.vocab_decoder = text_encoder.decoder + self.special_chars = special + + sequences = {} + for split in splits: + sequences[split], discarded = get_generation_sequences( + self.data, split, text_encoder, test, self.opt.data.maxe1, + self.opt.data.maxe2) + + if split == "train": + self.data[split]["total"] = [j for i, j in enumerate( + self.data[split]["total"]) if i not in set(discarded)] + self.masks[split]["total"] = [(len(i[0]), len(i[1]), len(i[2])) for + i in sequences[split]] + + self.max_e1 = max([max([l[0] for l in self.masks[split]["total"]]) + for split in self.masks]) + self.max_r = max([max([l[1] for l in self.masks[split]["total"]]) + for split in self.masks]) + self.max_e2 = max([max([l[2] for l in self.masks[split]["total"]]) + for split in self.masks]) + + print(self.max_e1) + print(self.max_r) + print(self.max_e2) + + for split in splits: + num_elements = len(sequences[split]) + self.sequences[split]["total"] = torch.LongTensor( + num_elements, self.max_e1 + self.max_e2 + self.max_r).fill_(0) + + for i, seq in enumerate(sequences[split]): + # print(self.sequences[split]["total"][i, :len(seq[0])].size()) + # print(torch.FloatTensor(seq[0]).size()) + self.sequences[split]["total"][i, :len(seq[0])] = \ + torch.LongTensor(seq[0]) + start_r = self.max_e1 + end_r = self.max_e1 + len(seq[1]) + self.sequences[split]["total"][i, start_r:end_r] = \ + torch.LongTensor(seq[1]) + start_e2 = self.max_e1 + self.max_r + end_e2 = self.max_e1 + self.max_r + len(seq[2]) + self.sequences[split]["total"][i, start_e2:end_e2] = \ + torch.LongTensor(seq[2]) + + if split in ["test", "dev"]: + print(split) + self.sequences[split]["negative"] = \ + self.sequences[split]["total"].index_select( + 0, torch.LongTensor([i for i, j in enumerate( + self.data[split]['total']) if not j[3]])) + # self.data[split]['total'][:self.sequences[split]["total"].size(0)]) if not j[3]])) + self.sequences[split]["positive"] = \ + self.sequences[split]["total"].index_select( + 0, torch.LongTensor([i for i, j in enumerate( + self.data[split]['total']) if j[3]])) + # self.data[split]['total'][:self.sequences[split]["total"].size(0)]) if j[3]])) + + def sample_batch(self, split, bs, cat="total", idxs=None): + offset = self.offsets[split][cat] + + batch = {} + + # Decided not to reduce computation on here because it's all parallel + # anyway and we don't want to run out of memory in cases where we + # don't see the longest version quickly enough + + if idxs: + seqs = self.sequences[split][cat].index_select( + 0, torch.LongTensor(idxs).to( + self.sequences[split][cat].device)) + else: + seqs = self.sequences[split][cat][offset:offset + bs] + batch["sequences"] = seqs.to(cfg.device) + batch["attention_mask"] = make_attention_mask(seqs) + batch["loss_mask"] = make_loss_mask(seqs, self.max_e1 + self.max_r) + batch["key"] = (cat, offset, offset + bs) + + offset += seqs.size(0) + + self.offsets[split][cat] = offset + + if split == "train" and offset + bs > len(self.sequences[split][cat]): + return batch, True + elif offset >= len(self.sequences[split][cat]): + return batch, True + else: + return batch, False + + def reset_offsets(self, splits=["train", "test", "dev"], + shuffle=True, keys=None): + if isinstance(splits, str): + splits = [splits] + + for split in splits: + if keys is None: + keys = ["total", "positive", "negative"] + + for key in keys: + self.offsets[split][key] = 0 + + if 
shuffle: + self.shuffle_sequences(split, keys) + + def shuffle_sequences(self, split="train", keys=None): + if keys is None: + # print(type(self.data)) + # print(type(self.data.keys())) + keys = self.data[split].keys() + + for key in keys: + if key in ["positive", "negative"]: + continue + idxs = list(range(len(self.data[split][key]))) + + random.shuffle(idxs) + + self.sequences[split][key] = \ + self.sequences[split][key].index_select( + 0, torch.LongTensor(idxs)) + + temp = [self.data[split][key][i] for i in idxs] + self.data[split][key] = temp + + temp = [self.masks[split][key][i] for i in idxs] + self.masks[split][key] = temp + + +def make_attention_mask(sequences): + return (sequences != 0).float().to(cfg.device) + + +def make_loss_mask(sequences, max_event): + # print(sequences.size()) + mask = (sequences != 0).float() + mask[:, :max_event] = 0 + return mask[:, 1:].to(cfg.device) + + +def get_generation_sequences(data, split, text_encoder, test, + max_e1=10, max_e2=15): + sequences = [] + count = 0 + + final_event1 = None + final_event2 = None + final_relation = None + + discarded = [] + + for event1, relation, event2, _ in tqdm(data[split]["total"]): + e1, r, e2 = do_example(text_encoder, event1, relation, event2) + + if (split == "train" and len(e1) > max_e1 or + len(e2) > max_e2): + discarded.append(count) + count += 1 + continue + + final = compile_final_sequence( + e1, e2, r, text_encoder) + + sequences.append(final) + + count += 1 + + if count > 10 and test: + break + + return sequences, discarded + + +def do_example(text_encoder, event1, relation, event2): + final_event1 = text_encoder.encode([event1], verbose=False)[0] + if relation.lower() != relation: + final_relation = [text_encoder.encoder[relation]] + else: + final_relation = text_encoder.encode( + [relation], verbose=False)[0] + if event2 is not None: + final_event2 = text_encoder.encode([event2], verbose=False)[0] + else: + final_event2 = None + + return final_event1, final_relation, final_event2 + + +def compile_final_sequence(final_event1, final_event2, final_relation, text_encoder): + final = [] + + final.append(final_event1) + final.append(final_relation) + final.append(final_event2) + + final[-1].append(text_encoder.encoder[""]) + + return final diff --git a/Model/COSMIC/feature_extraction/comet/src/data/config.py b/Model/COSMIC/feature_extraction/comet/src/data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..df17e9aa5edcd18ccf491f03220a90247f5f6bb9 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/data/config.py @@ -0,0 +1,186 @@ +import json +from comet.utils.utils import DD + +device = "cpu" + +save = False +test_save = False +toy = False +do_gen = False + +save_strategy = "all" + + +def get_parameters(opt, exp_type="model"): + params = DD() + params.net = DD() + + params.mle = 0 + params.dataset = opt.dataset + + params.net = get_net_parameters(opt) + params.train = get_training_parameters(opt) + + params.model = params.net.model + params.exp = opt.exp + + params.data = get_data_parameters(opt, params.exp, params.dataset) + params.eval = get_eval_parameters(opt, params.data.get("categories", None)) + + meta = DD() + + params.trainer = opt.trainer + + meta.iterations = int(opt.iterations) + meta.cycle = opt.cycle + params.cycle = opt.cycle + params.iters = int(opt.iterations) + + global toy + toy = opt.toy + + global do_gen + do_gen = opt.do_gen + + global save + save = opt.save + + global test_save + test_save = opt.test_save + + global save_strategy + save_strategy = 
+
+    print(params)
+    return params, meta
+
+
+def get_eval_parameters(opt, force_categories=None):
+    evaluate = DD()
+
+    if opt.eval_sampler == "beam":
+        evaluate.bs = opt.beam_size
+    elif opt.eval_sampler == "greedy":
+        evaluate.bs = 1
+    elif opt.eval_sampler == "topk":
+        evaluate.k = opt.topk_size
+
+    evaluate.smax = opt.gen_seqlength
+    evaluate.sample = opt.eval_sampler
+
+    evaluate.numseq = opt.num_sequences
+
+    evaluate.gs = opt.generate_sequences
+    evaluate.es = opt.evaluate_sequences
+
+    if opt.dataset == "atomic":
+        if "eval_categories" in opt and force_categories is None:
+            evaluate.categories = opt.eval_categories
+        else:
+            evaluate.categories = force_categories
+
+    return evaluate
+
+
+def get_data_parameters(opt, experiment, dataset):
+    data = DD()
+    if dataset == "atomic":
+        data.categories = sorted(opt.categories)
+        # hard-coded
+        data.maxe1 = 17
+        data.maxe2 = 35
+        data.maxr = 1
+
+    elif dataset == "conceptnet":
+        data.rel = opt.relation_format
+        data.trainsize = opt.training_set_size
+        data.devversion = opt.development_set_versions_to_use
+        data.maxe1 = opt.max_event_1_size
+        data.maxe2 = opt.max_event_2_size
+        if data.rel == "language":
+            # hard-coded
+            data.maxr = 5
+        else:
+            # hard-coded
+            data.maxr = 1
+
+    return data
+
+
+def get_training_parameters(opt):
+    train = DD()
+    static = DD()
+    static.exp = opt.exp
+
+    static.seed = opt.random_seed
+
+    # weight decay
+    static.l2 = opt.l2
+    static.vl2 = True
+    static.lrsched = opt.learning_rate_schedule  # 'warmup_linear'
+    static.lrwarm = opt.learning_rate_warmup  # 0.002
+
+    # gradient clipping
+    static.clip = opt.clip
+
+    # what loss function to use
+    static.loss = opt.loss
+
+    dynamic = DD()
+    dynamic.lr = opt.learning_rate  # learning rate
+    dynamic.bs = opt.batch_size  # batch size
+    # optimizer to use {adam, rmsprop, etc.}
+    dynamic.optim = opt.optimizer
+
+    # rmsprop
+    # alpha is interpolation average
+
+    static.update(opt[dynamic.optim])
+
+    train.static = static
+    train.dynamic = dynamic
+
+    return train
+
+
+def get_net_parameters(opt):
+    net = DD()
+    net.model = opt.model
+    net.nL = opt.num_layers
+    net.nH = opt.num_heads
+    net.hSize = opt.hidden_dim
+    net.edpt = opt.embedding_dropout
+    net.adpt = opt.attention_dropout
+    net.rdpt = opt.residual_dropout
+    net.odpt = opt.output_dropout
+    net.pt = opt.pretrain
+    net.afn = opt.activation
+
+    # how to initialize parameters
+    # format is gauss+{}+{}.format(mean, std)
+    # n = the default initialization pytorch
+    net.init = opt.init
+
+    return net
+
+
+def read_config(file_):
+    config = DD()
+    print(file_)
+    for k, v in file_.items():
+        if v in ("True", "T", "true"):
+            config[k] = True
+        elif v in ("False", "F", "false"):
+            config[k] = False
+        elif isinstance(v, dict):
+            config[k] = read_config(v)
+        else:
+            config[k] = v
+
+    return config
+
+
+def load_config(name):
+    with open(name, "r") as f:
+        config = json.load(f)
+    return config
diff --git a/Model/COSMIC/feature_extraction/comet/src/data/data.py b/Model/COSMIC/feature_extraction/comet/src/data/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f908dc3fa7801b794f2680e0066f04b2e1244b
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/comet/src/data/data.py
@@ -0,0 +1,85 @@
+import os
+import comet.src.data.atomic as atomic_data
+import comet.src.data.conceptnet as conceptnet_data
+import comet.src.data.config as cfg
+
+import comet.utils.utils as utils
+
+import pickle
+import torch
+import json
+
+
+# Special tokens; the angle-bracket literals were stripped from the original
+# file as markup and are restored here.
+start_token = "<START>"
+end_token = "<END>"
+blank_token = "<blank>"
+
+
+def save_checkpoint(state, filename):
+    print("Saving model to {}".format(filename))
+    torch.save(state, filename)
+
+
+def save_step(model, vocab, optimizer, opt, length, lrs):
+    if cfg.test_save:
+        name = "{}.pickle".format(utils.make_name(
+            opt, prefix="garbage/models/", is_dir=False, eval_=True))
+    else:
+        name = "{}.pickle".format(utils.make_name(
+            opt, prefix="models/", is_dir=False, eval_=True))
+    save_checkpoint({
+        "epoch": length, "state_dict": model.state_dict(),
+        "optimizer": optimizer.state_dict(), "opt": opt,
+        "vocab": vocab, "epoch_learning_rates": lrs},
+        name)
+
+
+def save_eval_file(opt, stats, eval_type="losses", split="dev", ext="pickle"):
+    if cfg.test_save:
+        name = "{}/{}.{}".format(utils.make_name(
+            opt, prefix="garbage/{}/".format(eval_type),
+            is_dir=True, eval_=True), split, ext)
+    else:
+        name = "{}/{}.{}".format(utils.make_name(
+            opt, prefix="results/{}/".format(eval_type),
+            is_dir=True, eval_=True), split, ext)
+    print("Saving {} {} to {}".format(split, eval_type, name))
+
+    if ext == "pickle":
+        with open(name, "wb") as f:
+            pickle.dump(stats, f)
+    elif ext == "txt":
+        with open(name, "w") as f:
+            f.write(stats)
+    elif ext == "json":
+        with open(name, "w") as f:
+            json.dump(stats, f)
+    else:
+        # A bare `raise` is only legal inside an except block; raise a
+        # descriptive error instead.
+        raise ValueError("Unsupported extension: {}".format(ext))
+
+
+def load_checkpoint(filename, gpu=True):
+    if os.path.exists(filename):
+        checkpoint = torch.load(
+            filename, map_location=lambda storage, loc: storage)
+    else:
+        # Previously printed a warning and then returned an unbound
+        # variable; fail loudly instead.
+        raise FileNotFoundError("No model found at {}".format(filename))
+    return checkpoint
+
+
+def make_data_loader(opt, *args):
+    if opt.dataset == "atomic":
+        return atomic_data.GenerationDataLoader(opt, *args)
+    elif opt.dataset == "conceptnet":
+        return conceptnet_data.GenerationDataLoader(opt, *args)
+
+
+def set_max_sizes(data_loader, force_split=None):
+    data_loader.total_size = {}
+    if force_split is not None:
+        data_loader.total_size[force_split] = \
+            data_loader.sequences[force_split]["total"].size(0)
+        return
+    for split in data_loader.sequences:
+        data_loader.total_size[split] = \
+            data_loader.sequences[split]["total"].size(0)
diff --git a/Model/COSMIC/feature_extraction/comet/src/data/utils.py b/Model/COSMIC/feature_extraction/comet/src/data/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ce30f182d555dc13614a531e12a8b0cad2e91bc
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/comet/src/data/utils.py
@@ -0,0 +1,134 @@
+import re
+import ftfy
+import json
+import spacy
+import torch
+
+from tqdm import tqdm
+
+
+def load_existing_data_loader(data_loader, path):
+    old_data_loader = torch.load(path)
+    for attr in data_loader.__dict__.keys():
+        if attr not in old_data_loader.__dict__.keys():
+            continue
+        setattr(data_loader, attr, getattr(old_data_loader, attr))
+
+
+################################################################################
+#
+# Code Below taken from HuggingFace pytorch-openai-lm repository
+#
+################################################################################
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+    word is represented as tuple of symbols (symbols being variable-length strings)
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def text_standardize(text):
+    """
+    fixes some issues the spacy tokenizer had on books corpus
+    also does some whitespace standardization
+    """
+    text = text.replace('—', '-')
+    text = text.replace('–', '-')
+    text = text.replace('―', '-')
+    text = text.replace('…', '...')
+    text = text.replace('´', "'")
+    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
+    text = re.sub(r'\s*\n\s*', ' \n ', text)
+    text = re.sub(r'[^\S\n]+', ' ', text)
+    return text.strip()
+
+
+class TextEncoder(object):
+    """
+    mostly a wrapper for a public python bpe tokenizer
+    """
+
+    def __init__(self, encoder_path, bpe_path):
+        self.nlp = spacy.load(
+            'en_core_web_sm', disable=['parser', 'tagger', 'ner', 'textcat'])
+        self.encoder = json.load(open(encoder_path))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        merges = open(bpe_path, encoding='utf-8').read().split('\n')[1:-1]
+        merges = [tuple(merge.split()) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+    def bpe(self, token):
+        # "</w>" is the BPE end-of-word marker used by the OpenAI GPT
+        # vocabulary; the literal was stripped from this file as markup.
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(
+                pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if (word[i] == first and i < len(word) - 1 and
+                        word[i+1] == second):
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        if word == '\n  </w>':
+            word = '\n</w>'
+        self.cache[token] = word
+        return word
+
+    def encode(self, texts, verbose=True):
+        texts_tokens = []
+        if verbose:
+            for text in tqdm(texts, ncols=80, leave=False):
+                text = self.nlp(text_standardize(ftfy.fix_text(text)))
+                text_tokens = []
+                for token in text:
+                    text_tokens.extend(
+                        [self.encoder.get(t, 0) for t in
+                         self.bpe(token.text.lower()).split(' ')])
+                texts_tokens.append(text_tokens)
+        else:
+            for text in texts:
+                text = self.nlp(text_standardize(ftfy.fix_text(text)))
+                text_tokens = []
+                for token in text:
+                    text_tokens.extend(
+                        [self.encoder.get(t, 0) for t in
+                         self.bpe(token.text.lower()).split(' ')])
+                texts_tokens.append(text_tokens)
+        return texts_tokens
diff --git a/Model/COSMIC/feature_extraction/comet/src/evaluate/atomic_evaluate.py b/Model/COSMIC/feature_extraction/comet/src/evaluate/atomic_evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..e22139b092b0326be36c95f8ed07a5eb15254b70
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/comet/src/evaluate/atomic_evaluate.py
@@ -0,0 +1,40 @@
+import comet.src.train.batch as batch
+import comet.src.evaluate.evaluate as base_evaluate
+import numpy as np
+
+def make_evaluator(opt, *args):
+    if opt.exp == "generation":
+        return AtomicGenerationEvaluator(opt, *args)
+    else:
+        # Classification path; AtomicClassificationEvaluator is expected to
+        # be provided elsewhere (it is not defined in this file).
+        return AtomicClassificationEvaluator(opt, *args)
+
+
+class AtomicGenerationEvaluator(base_evaluate.Evaluator):
+    def __init__(self, opt, 
model, data_loader): + super(AtomicGenerationEvaluator, self).__init__( + opt, model, data_loader) + + self.batch = batch.batch_atomic_generate + + def initialize_losses(self): + average_loss = {"total_micro": 0, "total_macro": 0} + nums = {"total_micro": 0, "total_macro": 0} + return average_loss, nums + + def compute_final_scores(self, average_loss, nums): + average_loss["total_macro"] /= nums["total_macro"] + average_loss["total_micro"] /= nums["total_micro"] + + average_loss["ppl_macro"] = np.exp(average_loss["total_macro"]) + average_loss["ppl_micro"] = np.exp(average_loss["total_micro"]) + + return average_loss + + def counter(self, nums): + return nums["total_macro"] + + def print_result(self, split, epoch_losses): + print("{} Loss: \t {}".format( + split, epoch_losses["total_micro"])) + print("{} Perplexity: \t {}".format( + split, epoch_losses["ppl_micro"])) diff --git a/Model/COSMIC/feature_extraction/comet/src/evaluate/conceptnet_evaluate.py b/Model/COSMIC/feature_extraction/comet/src/evaluate/conceptnet_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..197506aeacef6c1b77f7f9a51a36ca3f95cd85e1 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/evaluate/conceptnet_evaluate.py @@ -0,0 +1,82 @@ +import time +import numpy as np + +import comet.src.train.batch as batch_utils +import comet.utils.utils as utils +import comet.src.evaluate.evaluate as base_evaluate + + +def make_evaluator(opt, *args, **kwargs): + return ConceptNetGenerationEvaluator(opt, *args, **kwargs) + + +class ConceptNetGenerationEvaluator(base_evaluate.Evaluator): + def __init__(self, opt, model, data_loader, track=False): + super(ConceptNetGenerationEvaluator, self).__init__( + opt, model, data_loader) + + if track: + self.tracker = {"positive": [], "negative": []} + else: + self.tracker = None + + def batch(self, opt, nums, average_loss, batch_variables, eval_mode): + batch_variables["category"] = self.current_category + + outputs = batch_utils.batch_conceptnet_generate( + opt, nums, average_loss, batch_variables, eval_mode, + tracking_mode=self.tracker is not None) + + if outputs.get("tracking", None) is not None: + self.tracker[self.current_category] += outputs["tracking"] + + if outputs["reset"] and batch_variables["category"] == "positive": + outputs["reset"] = False + self.current_category = "negative" + + return outputs + + def initialize_losses(self): + average_loss = {"total_micro": 0, "total_macro": 0, + "negative_micro": 0, "negative_macro": 0} + nums = {"total_micro": 0, "total_macro": 0, + "negative_micro": 0, "negative_macro": 0} + + self.current_category = "positive" + + if self.tracker is not None: + self.tracker = {"positive": [], "negative": []} + + return average_loss, nums + + def compute_final_scores(self, average_loss, nums): + average_loss["total_macro"] /= nums["total_macro"] + average_loss["total_micro"] /= nums["total_micro"] + + if nums["negative_micro"]: + average_loss["negative_macro"] /= nums["negative_macro"] + average_loss["negative_micro"] /= nums["negative_micro"] + else: + average_loss["negative_macro"] = 0 + average_loss["negative_micro"] = 0 + + average_loss["macro_diff"] = (average_loss["negative_macro"] - + average_loss["total_macro"]) + average_loss["micro_diff"] = (average_loss["negative_micro"] - + average_loss["total_micro"]) + + average_loss["ppl_macro"] = np.exp(average_loss["total_macro"]) + average_loss["ppl_micro"] = np.exp(average_loss["total_micro"]) + + return average_loss + + def counter(self, nums): + return 
nums["total_macro"] + + def print_result(self, split, epoch_losses): + print("{} Loss: \t {}".format( + split, epoch_losses["total_micro"])) + print("{} Diff: \t {}".format( + split, epoch_losses["micro_diff"])) + print("{} Perplexity: \t {}".format( + split, epoch_losses["ppl_micro"])) diff --git a/Model/COSMIC/feature_extraction/comet/src/evaluate/conceptnet_generate.py b/Model/COSMIC/feature_extraction/comet/src/evaluate/conceptnet_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4302416cb7548e790ad5abd8e9339c5333629b --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/evaluate/conceptnet_generate.py @@ -0,0 +1,112 @@ +import time +import torch + +import comet.src.evaluate.generate as base_generate +import comet.src.evaluate.sampler as sampling +import comet.utils.utils as utils +import comet.src.data.config as cfg + + +def make_generator(opt, *args): + return ConceptNetGenerator(opt, *args) + + +class ConceptNetGenerator(base_generate.Generator): + def __init__(self, opt, model, data_loader): + self.opt = opt + + self.model = model + self.data_loader = data_loader + + self.sampler = sampling.make_sampler( + opt.eval.sample, opt, data_loader) + + def reset_sequences(self): + return [] + + def generate(self, split="dev"): + print("Generating Sequences") + + # Set evaluation mode + self.model.eval() + + # Reset evaluation set for dataset split + self.data_loader.reset_offsets(splits=split, shuffle=False) + + start = time.time() + count = 0 + sequences = None + + # Reset generated sequence buffer + sequences = self.reset_sequences() + + # Initialize progress bar + bar = utils.set_progress_bar( + self.data_loader.total_size[split] / 2) + + reset = False + + with torch.no_grad(): + # Cycle through development set + while not reset: + + start = len(sequences) + # Generate a single batch + reset = self.generate_batch(sequences, split, bs=1) + + end = len(sequences) + + if not reset: + bar.update(end - start) + else: + print(end) + + count += 1 + + if cfg.toy and count > 10: + break + if (self.opt.eval.gs != "full" and (count > opt.eval.gs)): + break + + torch.cuda.synchronize() + print("{} generations completed in: {} s".format( + split, time.time() - start)) + + # Compute scores for sequences (e.g., BLEU, ROUGE) + # Computes scores that the generator is initialized with + # Change define_scorers to add more scorers as possibilities + # avg_scores, indiv_scores = self.compute_sequence_scores( + # sequences, split) + avg_scores, indiv_scores = None, None + + return sequences, avg_scores, indiv_scores + + def generate_batch(self, sequences, split, verbose=False, bs=1): + # Sample batch from data loader + batch, reset = self.data_loader.sample_batch( + split, bs=bs, cat="positive") + + start_idx = self.data_loader.max_e1 + self.data_loader.max_r + max_end_len = self.data_loader.max_e2 + + context = batch["sequences"][:, :start_idx] + reference = batch["sequences"][:, start_idx:] + init = "".join([self.data_loader.vocab_decoder[i].replace( + '', ' ') for i in context[:, :self.data_loader.max_e1].squeeze().tolist() if i]).strip() + + start = self.data_loader.max_e1 + end = self.data_loader.max_e1 + self.data_loader.max_r + + attr = "".join([self.data_loader.vocab_decoder[i].replace( + '', ' ') for i in context[:, start:end].squeeze(0).tolist() if i]).strip() + + # Decode sequence + sampling_result = self.sampler.generate_sequence( + batch, self.model, self.data_loader, start_idx, max_end_len) + + sampling_result["key"] = batch["key"] + 
sampling_result["e1"] = init + sampling_result["r"] = attr + sequences.append(sampling_result) + + return reset diff --git a/Model/COSMIC/feature_extraction/comet/src/evaluate/evaluate.py b/Model/COSMIC/feature_extraction/comet/src/evaluate/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..9a761a820266a410716b42f171ed61d93f0e3ce5 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/evaluate/evaluate.py @@ -0,0 +1,85 @@ +import time +import torch + +import comet.utils.utils as utils +import comet.src.data.config as cfg + + +class Evaluator(object): + def __init__(self, opt, model, data_loader): + super(Evaluator, self).__init__() + + self.data_loader = data_loader + self.model = model + + self.batch_variables = { + "model": model, + "data": data_loader + } + + self.opt = opt + + def validate(self, l, split="dev", losses={}, keyset=None): + self.batch_variables["split"] = split + print("Evaluating {}".format(split)) + + epoch_losses = self.epoch( + self.opt, self.model, self.data_loader, split, keyset) + + self.print_result(split, epoch_losses) + + for loss_name, loss_val in epoch_losses.items(): + losses.setdefault(loss_name, {}) + losses[loss_name][l] = loss_val + + def epoch(self, opt, model, data_loader, split, keyset=None): + average_loss, nums = self.initialize_losses() + + data_loader.reset_offsets(splits=split, shuffle=False) + + # Set evaluation mode + model.eval() + + start = time.time() + + # Initialize progress bar + bar = utils.set_progress_bar( + data_loader.total_size[split]) + + reset = False + + with torch.no_grad(): + while not reset: + + start = data_loader.offset_summary(split) + + outputs = self.batch( + opt, nums, average_loss, + self.batch_variables, eval_mode=True) + + end = data_loader.offset_summary(split) + + reset = outputs["reset"] + + if not reset: + bar.update(end - start) + else: + print(end) + + if cfg.toy and self.counter(nums) > 100: + break + if (opt.eval.es != "full" and + (self.counter(nums) > opt.eval.es)): + break + + nums = outputs["nums"] + + torch.cuda.synchronize() + + print("{} evaluation completed in: {} s".format( + split.capitalize(), time.time() - start)) + + average_loss = self.compute_final_scores( + average_loss, nums) + + return average_loss diff --git a/Model/COSMIC/feature_extraction/comet/src/evaluate/generate.py b/Model/COSMIC/feature_extraction/comet/src/evaluate/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..cc1d2830b03ad58eeb3f82d6d3bd2f9246e359fd --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/evaluate/generate.py @@ -0,0 +1,72 @@ +import comet.src.data.data as data +import comet.src.data.config as cfg +import comet.src.evaluate.sampler as sampling + + +def do_gen_run(opt, generator, l, split="dev", scores={}): + # Generate sequences for examples in evaluation set using + # current trained model + + if opt.eval.gs == "full": + sequences, avg_scores, indiv_scores = generator.generate(split) + else: + sequences, avg_scores, indiv_scores = generator.generate_some(split) + + if avg_scores is not None: + # Record scores from generated sequences + for score_name, score_val in avg_scores.items(): + scores.setdefault(score_name, {}) + scores[score_name].setdefault(l, []) + scores[score_name][l] += [score_val] + + # Save generated sequences + save_sequences(opt, sequences, avg_scores, indiv_scores, + l, split, opt.eval.gs == "full", + generator.data_loader) + + +def save_sequences(opt, sequences, avg_scores, indiv_scores, + l, split, full, data_loader): 
+ # This seems a bit roundabout since l = opt.train.dynamic in train.py + # But it's in case we start checkpointing outside of epoch boundaries + opt.train.dynamic.epoch = l + + if cfg.save: + if full: + names = {"gens": "gens", "scores": "scores", + "indiv": "indiv.scores"} + else: + names = {"gens": "gens.small", "scores": "scores.small", + "indiv": "indiv.scores.small"} + # Save generated sequences + data.save_eval_file(opt, sequences, names["gens"], split) + + if avg_scores is not None: + # Save average scores over evaluation set for generated sequences + # Scores computed are the ones the generator was initialized with + data.save_eval_file(opt, avg_scores, names["scores"], split) + + if split == "dev": + # Save individual scores + data.save_eval_file( + opt, indiv_scores, names["indiv"], split) + + +class Generator(object): + def __init__(self, opt, model, data_loader, scorers, reward_function=None): + super(Generator, self).__init__() + self.opt = opt + + self.model = model + self.data_loader = data_loader + + self.sampler = sampling.make_sampler( + opt.eval.sample, opt, data_loader) + + + def generate(self, split="dev"): + pass + + def generate_batch(self, sequences, split, verbose=False, bs=32): + pass + diff --git a/Model/COSMIC/feature_extraction/comet/src/evaluate/sampler.py b/Model/COSMIC/feature_extraction/comet/src/evaluate/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..518a641549ea1e1baabc200634531641dc473002 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/evaluate/sampler.py @@ -0,0 +1,329 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comet.src.data.data as data +import comet.src.data.config as cfg +import comet.src.models.utils as model_utils +import comet.src.evaluate.utils as eval_utils +import comet.src.train.batch as batch_utils + +def make_sampler(sampler_type, opt, *args, **kwargs): + print("Initializing Greedy Sampler") + return GreedySampler(opt, *args, **kwargs) + +class Sampler(): + def __init__(self, opt, data_loader, batch_mode=False): + # Token on which to end sampling + self.end_token = data_loader.vocab_encoder[data.end_token] + + self.opt = opt + + def generate_sequence(self, batch, model): + raise + + +class GreedySampler(Sampler): + def __init__(self, opt, data_loader, batch_mode=True): + super(GreedySampler, self).__init__(opt, data_loader) + + def append_batch(self, X, next_idx, mask): + next_pos = X[:, -1:, 1] + 1 + next_x = torch.cat((next_idx, next_pos), -1).unsqueeze(1) + next_mask = torch.cat([mask, torch.ones(X.size(0), 1, device=mask.device)], 1) + return torch.cat((X, next_x), 1), next_mask + + def generate_sequence(self, batch, model, data_loader, start_idx, end_len): + XMB = batch["sequences"][:, :start_idx] + MMB = batch["attention_mask"][:, :start_idx] + + XMB = model_utils.prepare_position_embeddings( + self.opt, data_loader.vocab_encoder, XMB.unsqueeze(-1)) + + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + values, indices = lm_probs[:, -1, :].max(dim=-1) + seqs = indices.clone().unsqueeze(1) + + loss = values + counts = 1 + next_pos = XMB[:, -1:, 1] + 1 + next_x = torch.cat((indices.view(-1, 1), next_pos), -1).unsqueeze(1) + XMB = torch.cat((XMB, next_x), 1) + MMB = torch.cat([MMB, torch.ones(XMB.size(0), 1, device=MMB.device)], 1) + + # Sample from top k + + for _ in range(self.opt.eval.smax): + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + # Sample from 
top k + values, next_idx = lm_probs[:, -1, :].max(dim=-1) + + loss += values + counts += 1 + + next_idx = next_idx.unsqueeze(1) + + seqs = torch.cat([seqs, next_idx], 1) + + if (next_idx.item() == self.end_token) or (_ == end_len - 1): + break + + XMB, MMB = self.append_batch(XMB, next_idx, MMB) + + beams = [] + + for beam in seqs: + beams.append(" ".join("".join( + [data_loader.vocab_decoder[tok.item()].replace( + '', ' ').replace('\n', '') + for tok in beam if tok != self.end_token]).split())) + + sampling_result = { + "sequence": beams[0], + "beams": beams, + "beam_losses": [loss.item()], + "loss": loss.item(), + "beam_lengths": [counts], + "length": counts + } + + return sampling_result + + +class TopKSampler(Sampler): + def __init__(self, opt, data_loader, batch_mode=True): + super(TopKSampler, self).__init__(opt, data_loader) + + def append_batch(self, X, next_idx, mask): + next_pos = X[:, -1:, 1] + 1 + next_x = torch.cat((next_idx, next_pos), -1).unsqueeze(1) + next_mask = torch.cat([mask, torch.ones(X.size(0), 1, device=mask.device)], 1) + return torch.cat((X, next_x), 1), next_mask + + def generate_sequence(self, batch, model, data_loader, start_idx, end_len): + # start_idx = context_size_event + 1 + # start_idx = max_e1 + max_r + # end_idx = context_size_effect - 1 + # end_idx = max_e2 + XMB = batch["sequences"][:, :start_idx] + MMB = batch["attention_mask"][:, :start_idx] + + XMB = model_utils.prepare_position_embeddings( + self.opt, data_loader.vocab_encoder, XMB.unsqueeze(-1)) + + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + values, indices = lm_probs[:, -1, :].topk(self.opt.eval.k) + seqs = indices.t().clone() + + losses = - values.view(-1, 1) + + ended = (seqs == self.end_token).float() + counts = (1 - ended) + XMB = XMB.repeat(self.opt.eval.k, 1, 1) + MMB = MMB.repeat(self.opt.eval.k, 1) + next_pos = XMB[:, -1:, 1] + 1 + next_x = torch.cat((indices.view(self.opt.eval.k, -1), next_pos), -1).unsqueeze(1) + XMB = torch.cat((XMB, next_x), 1) + MMB = torch.cat([MMB, torch.ones(XMB.size(0), 1, device=MMB.device)], 1) + + # Sample from top k + + for _ in range(end_len): + _, lp = model(XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + # Sample from top k + values, indices = lm_probs[:, -1, :].topk(self.opt.eval.k) + choice = torch.multinomial(values.exp(), 1) + next_idx = indices.gather(-1, choice) + + ended = ended + (next_idx == self.end_token).float() * (1 - ended) + + next_idx = next_idx * (1 - ended).long() + ended.long() * self.end_token + + counts += (1 - ended) + + seqs = torch.cat([seqs, next_idx], 1) + + if ended.sum().item() == self.opt.eval.k: + break + + losses -= values.gather(-1, choice) * (1 - ended) + + XMB, MMB = self.append_batch(XMB, next_idx, MMB) + + beams = [] + + for beam in seqs: + beams.append(" ".join("".join( + [data_loader.vocab_decoder[tok.item()].replace( + '', ' ').replace('\n', '') + for tok in beam if tok != self.end_token]).split())) + + sampling_result = { + "sequence": beams[0], + "beams": beams, + "beam_losses": losses.squeeze().tolist(), + "loss": losses[0].item(), + "beam_lengths": counts.long().squeeze().tolist(), + "length": counts[0].long().item() + } + + return sampling_result + + +class BeamSampler(TopKSampler): + def __init__(self, opt, data_loader, batch_mode=True, scorer=None): + super(BeamSampler, self).__init__(opt, data_loader, batch_mode) + + self.kill_mask = torch.ones(opt.eval.bs, opt.eval.bs).to(cfg.device) * 9000 + self.kill_mask[:, 0] = 0 + + 
def make_batch(self, X): + X = np.array(X) + assert X.ndim in [1, 2] + if X.ndim == 1: + X = np.expand_dims(X, axis=0) + pos_enc = np.arange(n_vocab + n_special, n_vocab + n_special + X.shape[-1]) + pos_enc = np.expand_dims(pos_enc, axis=0) + batch = np.stack([X, pos_enc], axis=-1) + batch = torch.tensor(batch, dtype=torch.long).to(device) + return batch + + def append_batch(self, X, beam_toks, mask): + next_pos = X[:, -1:, 1] + 1 + next_x = torch.cat((beam_toks.unsqueeze(1), next_pos), -1).unsqueeze(1) + next_mask = torch.cat([mask, torch.ones(X.size(0), 1, device=mask.device)], 1) + return torch.cat((X, next_x), 1), next_mask + + def generate_sequence(self, batch, model, data_loader, start_idx, end_len): + # start_idx = context_size_event + 1 + # start_idx = max_e1 + max_r + # end_idx = context_size_effect - 1 + # end_idx = max_e2 + XMB = batch["sequences"][:, :start_idx] + MMB = batch["attention_mask"][:, :start_idx] + + XMB = model_utils.prepare_position_embeddings( + self.opt, data_loader.vocab_encoder, XMB.unsqueeze(-1)) + + tokens = [] + beam_losses = [] + # Beam Search + beam_lls, beam_toks, beam_seqs = None, None, None + _, lp = model(XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + dist = lm_probs[:, -1, :].squeeze() + beam_lls, beam_toks = dist.topk(self.opt.eval.bs) + beam_losses.append(beam_lls) + + ended = (beam_toks == self.end_token).float() + counts = (2 - ended) + beam_toks = beam_toks.unsqueeze(1) + beam_seqs = beam_toks.clone() + XMB = XMB.repeat(self.opt.eval.bs, 1, 1) + MMB = MMB.repeat(self.opt.eval.bs, 1) + next_pos = XMB[:, -1:, 1] + 1 + next_x = torch.cat((beam_toks, next_pos), -1).unsqueeze(1) + XMB = torch.cat((XMB, next_x), 1) + MMB = torch.cat([MMB, torch.ones(XMB.size(0), 1, device=MMB.device)], 1) + + for _ in range(end_len): + + # Compute distribution for current beam + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + dist = lm_probs[:, -1, :].squeeze() + + # get hypothesis tokens for distribution + hyp_beam_lls, hyp_beam_toks = dist.topk(self.opt.eval.bs) + + # Compute masks and expand beam + expanded_ended = ended.unsqueeze(1).repeat(1, self.opt.eval.bs) + hypothesis_mask = expanded_ended * self.kill_mask + (1 - expanded_ended) + + paper_results = False + + if paper_results: + # Results from paper with slightly buggy beam search + current_beam_lls = beam_lls.unsqueeze(1).repeat( + 1, self.opt.eval.bs).view(self.opt.eval.bs**2) + else: + # Current beam search implementation + current_beam_lls = beam_losses[-1].unsqueeze(1).repeat( + 1, self.opt.eval.bs).view(self.opt.eval.bs**2) + + # Compute losses of hypotheses, masking those that have ended + hyp_beam_lls = (hyp_beam_lls.view(self.opt.eval.bs**2) * + hypothesis_mask.view(-1)) + current_beam_lls + + # Get normalizer for sequences + temp_counts = counts.unsqueeze(1).repeat(1, self.opt.eval.bs).view( + self.opt.eval.bs ** 2) + + # Select best beams with lowest aggregate loss + beam_lls, top_beam_idxs = (hyp_beam_lls / temp_counts).topk(self.opt.eval.bs) + + # Update placements in beam based on selecetion + beam_losses = [i.index_select(0, top_beam_idxs // self.opt.eval.bs) + for i in beam_losses] + ended = ended.index_select(0, top_beam_idxs // self.opt.eval.bs) + counts = temp_counts.index_select(0, top_beam_idxs) + + # Save beam losses + beam_losses.append(beam_lls * counts) + + # Update beam tokens + ended_mask = (1 - ended).long() + end_replacement = (self.end_token * ended).long() + next_toks = 
hyp_beam_toks.view(-1)[top_beam_idxs] + beam_toks = next_toks * ended_mask + end_replacement + + # Update ended and counts + ended = ended + (beam_toks == self.end_token).float() * (1 - ended) + counts = counts + (1 - ended) + + # Update beam sequences + beam_seqs = beam_seqs.t().repeat(self.opt.eval.bs, 1).t().contiguous().view( + self.opt.eval.bs**2, -1)[top_beam_idxs] + beam_seqs = torch.cat((beam_seqs, beam_toks.unsqueeze(1)), dim=1) + + # I have no idea what's going on but Ari's on point with it + XMB = XMB.transpose(0, 1).transpose(1, 2).repeat( + self.opt.eval.bs, 1, 1).transpose(2, 1).transpose( + 1, 0).contiguous().view( + self.opt.eval.bs**2, XMB.size(1), XMB.size(2))[top_beam_idxs] + + XMB, MMB = self.append_batch(XMB, beam_toks, MMB) + + if (beam_toks == self.end_token).sum().item() == self.opt.eval.bs: + break + + beams = [] + + for beam in beam_seqs: + beams.append(" ".join("".join( + [data_loader.vocab_decoder[tok.item()].replace( + '', ' ').replace('\n', '') + for tok in beam if tok != self.end_token]).split())) + + sampling_result = { + "sequence": beams[0], + "beams": beams, + "beam_losses": beam_lls.tolist(), + "loss": beam_lls[0].item(), + "beam_lengths": counts.tolist(), + "length": counts[0].item() + } + + return sampling_result diff --git a/Model/COSMIC/feature_extraction/comet/src/evaluate/utils.py b/Model/COSMIC/feature_extraction/comet/src/evaluate/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca71e091988215268645a13dcb38448cd05cff1 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/evaluate/utils.py @@ -0,0 +1,39 @@ + +def update_classification_losses(losses, nums, name, bs, loss): + if not isinstance(loss, float): + print(type(loss)) + raise + + nums[name] += bs + + losses[name] += loss * bs + + +def update_generation_losses(losses, nums, micro, macro, bs, length, loss): + # Update Losses + nums[macro] += bs + + if isinstance(length, int): + update_indiv_generation_losses( + losses, nums, micro, macro, bs, length, loss) + else: + update_tensor_generation_losses( + losses, nums, micro, macro, bs, length, loss) + + +def update_indiv_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += bs * length + + batch_loss = loss * bs + + losses[micro] += batch_loss + losses[macro] += batch_loss / length + + +def update_tensor_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += length.sum().item() + + losses[micro] += loss.sum().item() + losses[macro] += (loss / length.float()).sum().item() diff --git a/Model/COSMIC/feature_extraction/comet/src/interactive/functions.py b/Model/COSMIC/feature_extraction/comet/src/interactive/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..0d73cb5e354595f3df4facb995bd7983ec9f3163 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/interactive/functions.py @@ -0,0 +1,376 @@ +import os +import torch + +from comet.src.data.utils import TextEncoder +import comet.src.data.config as cfg +import comet.src.data.data as data +import comet.src.models.models as models +from comet.src.evaluate.sampler import BeamSampler, GreedySampler, TopKSampler +import comet.utils.utils as utils + + +def load_model_file(model_file): + model_stuff = data.load_checkpoint(model_file) + opt = model_stuff["opt"] + state_dict = model_stuff["state_dict"] + + return opt, state_dict + + +def load_data(dataset, opt, dir="."): + if dataset == "atomic": + data_loader = load_atomic_data(opt, dir) + elif dataset == "conceptnet": + 
data_loader = load_conceptnet_data(opt, dir) + + # Initialize TextEncoder + encoder_path = os.path.join(dir, "comet/model/encoder_bpe_40000.json") + bpe_path = os.path.join(dir, "comet/model/vocab_40000.bpe") + text_encoder = TextEncoder(encoder_path, bpe_path) + text_encoder.encoder = data_loader.vocab_encoder + text_encoder.decoder = data_loader.vocab_decoder + + return data_loader, text_encoder + + +def load_atomic_data(opt, dir="."): + # Hacky workaround, you may have to change this + # if your models use different pad lengths for e1, e2, r + if opt.data.get("maxe1", None) is None: + opt.data.maxe1 = 17 + opt.data.maxe2 = 35 + opt.data.maxr = 1 + + # temporarily change to the target directory + current_dir = os.getcwd() + os.chdir(dir) + + path = "comet/data/atomic/processed/generation/categories_oEffect#oReact#oWant#xAttr#xEffect#xIntent#xNeed#xReact#xWant-maxe1_17-maxe2_35-maxr_1.pickle" + data_loader = data.make_data_loader(opt, opt.data.categories) + loaded = data_loader.load_data(path) + + # go back to the original working directory + os.chdir(current_dir) + + return data_loader + + +def load_conceptnet_data(opt, dir="."): + # Hacky workaround, you may have to change this + # if your models use different pad lengths for r + if opt.data.get("maxr", None) is None: + if opt.data.rel == "language": + opt.data.maxr = 5 + else: + opt.data.maxr = 1 + + # temporarily change to the target directory + current_dir = os.getcwd() + os.chdir(dir) + + path = "comet/data/conceptnet/processed/generation/{}.pickle".format( + utils.make_name_string(opt.data) + ) + data_loader = data.make_data_loader(opt) + loaded = data_loader.load_data(path) + + # go back to the original working directory + os.chdir(current_dir) + + return data_loader + + +def make_model(opt, n_vocab, n_ctx, state_dict): + model = models.make_model( + opt, n_vocab, n_ctx, None, load=False, return_acts=True, return_probs=False + ) + + models.load_state_dict(model, state_dict) + + model.eval() + return model + + +def set_sampler(opt, sampling_algorithm, data_loader): + if "beam" in sampling_algorithm: + opt.eval.bs = int(sampling_algorithm.split("-")[1]) + sampler = BeamSampler(opt, data_loader) + elif "topk" in sampling_algorithm: + # print("Still bugs in the topk sampler. 
Use beam or greedy instead") + # raise NotImplementedError + opt.eval.k = int(sampling_algorithm.split("-")[1]) + sampler = TopKSampler(opt, data_loader) + else: + sampler = GreedySampler(opt, data_loader) + + return sampler + + +def get_atomic_sequence( + input_event, model, sampler, data_loader, text_encoder, category +): + if isinstance(category, list): + outputs = {} + for cat in category: + new_outputs = get_atomic_sequence( + input_event, model, sampler, data_loader, text_encoder, cat + ) + outputs.update(new_outputs) + return outputs + elif category == "all": + outputs = {} + + for category in data_loader.categories: + new_outputs = get_atomic_sequence( + input_event, model, sampler, data_loader, text_encoder, category + ) + outputs.update(new_outputs) + return outputs + else: + sequence_all = {} + + sequence_all["event"] = input_event + sequence_all["effect_type"] = category + + with torch.no_grad(): + batch = set_atomic_inputs(input_event, category, data_loader, text_encoder) + + sampling_result = sampler.generate_sequence( + batch, + model, + data_loader, + data_loader.max_event + + data.atomic_data.num_delimiter_tokens["category"], + data_loader.max_effect + - data.atomic_data.num_delimiter_tokens["category"], + ) + + sequence_all["beams"] = sampling_result["beams"] + + # print_atomic_sequence(sequence_all) + + return {category: sequence_all} + + +def print_atomic_sequence(sequence_object): + input_event = sequence_object["event"] + category = sequence_object["effect_type"] + + print("Input Event: {}".format(input_event)) + print("Target Effect: {}".format(category)) + print("") + print("Candidate Sequences:") + for beam in sequence_object["beams"]: + print(beam) + print("") + print("====================================================") + print("") + + +def set_atomic_inputs(input_event, category, data_loader, text_encoder): + XMB = torch.zeros(1, data_loader.max_event + 1).long().to(cfg.device) + prefix, suffix = data.atomic_data.do_example( + text_encoder, input_event, None, True, None + ) + + if len(prefix) > data_loader.max_event + 1: + prefix = prefix[: data_loader.max_event + 1] + + XMB[:, : len(prefix)] = torch.LongTensor(prefix) + XMB[:, -1] = torch.LongTensor([text_encoder.encoder["<{}>".format(category)]]) + + batch = {} + batch["sequences"] = XMB + batch["attention_mask"] = data.atomic_data.make_attention_mask(XMB) + + return batch + + +def get_conceptnet_sequence( + e1, model, sampler, data_loader, text_encoder, relation, force=False +): + if isinstance(relation, list): + outputs = {} + + for rel in relation: + new_outputs = get_conceptnet_sequence( + e1, model, sampler, data_loader, text_encoder, rel + ) + outputs.update(new_outputs) + return outputs + elif relation == "all": + outputs = {} + + for relation in data.conceptnet_data.conceptnet_relations: + new_outputs = get_conceptnet_sequence( + e1, model, sampler, data_loader, text_encoder, relation + ) + outputs.update(new_outputs) + return outputs + else: + sequence_all = {} + + sequence_all["e1"] = e1 + sequence_all["relation"] = relation + + with torch.no_grad(): + if data_loader.max_r != 1: + relation_sequence = data.conceptnet_data.split_into_words[relation] + else: + relation_sequence = "<{}>".format(relation) + + batch, abort = set_conceptnet_inputs( + e1, + relation_sequence, + text_encoder, + data_loader.max_e1, + data_loader.max_r, + force, + ) + + if abort: + return {relation: sequence_all} + + sampling_result = sampler.generate_sequence( + batch, + model, + data_loader, + data_loader.max_e1 + 
data_loader.max_r,
+                data_loader.max_e2,
+            )
+
+            sequence_all["beams"] = sampling_result["beams"]
+
+        print_conceptnet_sequence(sequence_all)
+
+        return {relation: sequence_all}
+
+
+def set_conceptnet_inputs(input_event, relation, text_encoder, max_e1, max_r, force):
+    abort = False
+
+    e1_tokens, rel_tokens, _ = data.conceptnet_data.do_example(
+        text_encoder, input_event, relation, None
+    )
+
+    if len(e1_tokens) > max_e1:
+        if force:
+            XMB = torch.zeros(1, len(e1_tokens) + max_r).long().to(cfg.device)
+        else:
+            XMB = torch.zeros(1, max_e1 + max_r).long().to(cfg.device)
+            return {}, True
+    else:
+        XMB = torch.zeros(1, max_e1 + max_r).long().to(cfg.device)
+
+    XMB[:, : len(e1_tokens)] = torch.LongTensor(e1_tokens)
+    XMB[:, max_e1 : max_e1 + len(rel_tokens)] = torch.LongTensor(rel_tokens)
+
+    batch = {}
+    batch["sequences"] = XMB
+    batch["attention_mask"] = data.conceptnet_data.make_attention_mask(XMB)
+
+    return batch, abort
+
+
+def print_conceptnet_sequence(sequence_object):
+    e1 = sequence_object["e1"]
+    relation = sequence_object["relation"]
+
+    print("Input Entity: {}".format(e1))
+    print("Target Relation: {}".format(relation))
+    print("")
+    print("Candidate Sequences:")
+    for beam in sequence_object["beams"]:
+        print(beam)
+        print("")
+    print("====================================================")
+    print("")
+
+
+def print_help(data):
+    print("")
+    if data == "atomic":
+        print('Provide a seed event such as "PersonX goes to the mall"')
+        print("Don't include names, instead replacing them with PersonX, PersonY, etc.")
+        print("The event should always have PersonX included")
+    if data == "conceptnet":
+        print('Provide a seed entity such as "go to the mall"')
+        print("Because the model was trained on lemmatized entities,")
+        print("it works best if the input entities are also lemmatized")
+    print("")
+
+
+def print_relation_help(data):
+    print_category_help(data)
+
+
+def print_category_help(data):
+    print("")
+    if data == "atomic":
+        print("Enter a possible effect type from the following effect types:")
+        print(
+            "all - compute the output for all effect types "
+            "{oEffect, oReact, oWant, xAttr, xEffect, xIntent, xNeed, xReact, xWant}"
+        )
+        print(
+            "oEffect - generate the effect of the event on participants other than PersonX"
+        )
+        print(
+            "oReact - generate the reactions of participants other than PersonX to the event"
+        )
+        print(
+            # Label fixed from a duplicated "oEffect": the description matches oWant
+            "oWant - generate what participants other than PersonX may want after the event"
+        )
+    elif data == "conceptnet":
+        print("Enter a possible relation from the following list:")
+        print("")
+        print("AtLocation")
+        print("CapableOf")
+        print("Causes")
+        print("CausesDesire")
+        print("CreatedBy")
+        print("DefinedAs")
+        print("DesireOf")
+        print("Desires")
+        print("HasA")
+        print("HasFirstSubevent")
+        print("HasLastSubevent")
+        print("HasPainCharacter")
+        print("HasPainIntensity")
+        print("HasPrerequisite")
+        print("HasProperty")
+        print("HasSubevent")
+        print("InheritsFrom")
+        print("InstanceOf")
+        print("IsA")
+        print("LocatedNear")
+        print("LocationOfAction")
+        print("MadeOf")
+        print("MotivatedByGoal")
+        print("NotCapableOf")
+        print("NotDesires")
+        print("NotHasA")
+        print("NotHasProperty")
+        print("NotIsA")
+        print("NotMadeOf")
+        print("PartOf")
+        print("ReceivesAction")
+        print("RelatedTo")
+        print("SymbolOf")
+        print("UsedFor")
+        print("")
+        print("NOTE: Capitalization is important")
+    else:
+        # A bare `raise` is only legal inside an except block
+        raise ValueError("Unknown dataset: {}".format(data))
+    print("")
+
+
+def print_sampling_help():
+    print("")
+    print(
+        "Provide a sampling algorithm to produce the sequence with from the following:"
+    )
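+    # For example (matching how set_sampler() above parses these strings --
+    # it splits on "-" and reads the trailing number):
+    #   "greedy"  -> GreedySampler
+    #   "beam-5"  -> BeamSampler with opt.eval.bs = 5
+    #   "topk-10" -> TopKSampler with opt.eval.k = 10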
+ print("") + print("greedy") + print("beam-# where # is the beam size") + print("topk-# where # is k") + print("") diff --git a/Model/COSMIC/feature_extraction/comet/src/main.py b/Model/COSMIC/feature_extraction/comet/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..703cd3d05ca4e5e19db03a4bd1a309de0ac2c402 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/main.py @@ -0,0 +1,19 @@ +import sys +import os +import argparse + +sys.path.append(os.getcwd()) + +parser = argparse.ArgumentParser() +parser.add_argument("--experiment_type", type=str, default='atomic', + choices=["atomic", "conceptnet"]) +parser.add_argument("--experiment_num", type=str, default="0") + +args = parser.parse_args() + +if args.experiment_type == "atomic": + from main_atomic import main + main(args.experiment_num) +if args.experiment_type == "conceptnet": + from main_conceptnet import main + main(args.experiment_num) diff --git a/Model/COSMIC/feature_extraction/comet/src/main_atomic.py b/Model/COSMIC/feature_extraction/comet/src/main_atomic.py new file mode 100644 index 0000000000000000000000000000000000000000..cc6aacda3781a0b83ac9d1e51b3be4f0d3e14a35 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/main_atomic.py @@ -0,0 +1,125 @@ + +import random + +import torch + +import comet.src.train.atomic_train as train +import comet.src.models.models as models +import comet.src.data.data as data +import comet.utils.utils as utils +import comet.src.train.utils as train_utils +import comet.src.data.config as cfg + +from comet.src.data.utils import TextEncoder +from comet.src.train.opt import OpenAIAdam + + +def main(num): + # Generate configuration files depending on experiment being run + utils.generate_config_files("atomic", num) + + # Loads the correct configuration file + config_file = "config/atomic/config_{}.json".format(num) + + print(config_file) + + # Read config file to option + config = cfg.read_config(cfg.load_config(config_file)) + opt, meta = cfg.get_parameters(config) + + # Set the random seeds + torch.manual_seed(opt.train.static.seed) + random.seed(opt.train.static.seed) + if config.gpu_mode: + torch.cuda.manual_seed_all(opt.train.static.seed) + + # Where to find the data + splits = ["train", "dev", "test"] + + opt.train.dynamic.epoch = 0 + + print("Loading Data") + + categories = opt.data.categories + + path = "data/atomic/processed/{}/{}.pickle".format( + opt.exp, utils.make_name_string(opt.data)) + + data_loader = data.make_data_loader(opt, categories) + loaded = data_loader.load_data(path) + print(data_loader.sequences["train"]["total"].size(0)) + data_loader.opt = opt + data_loader.batch_size = opt.train.dynamic.bs + + print("Done.") + + # Initialize text_encoder + text_encoder = TextEncoder(config.encoder_path, config.bpe_path) + + special = [data.start_token, data.end_token] + special += ["<{}>".format(cat) for cat in categories] + special += [data.blank_token] + + text_encoder.encoder = data_loader.vocab_encoder + text_encoder.decoder = data_loader.vocab_decoder + + opt.data.maxe1 = data_loader.max_event + opt.data.maxe2 = data_loader.max_effect + opt.data.maxr = data.atomic_data.num_delimiter_tokens["category"] + + n_special = len(special) + n_ctx = opt.data.maxe1 + opt.data.maxe2 + n_vocab = len(text_encoder.encoder) + n_ctx + + print(data_loader.__dict__.keys()) + opt.net.vSize = n_vocab + + print("Building Model") + + model = models.make_model( + opt, n_vocab, n_ctx, n_special, + load=(opt.net.init=="pt")) + + print("Done.") + + print("Files will 
be logged at: {}".format( + utils.make_name(opt, prefix="results/losses/", + is_dir=True, eval_=True))) + + data_loader.reset_offsets("train") + + # Get number of examples + data.set_max_sizes(data_loader) + + if config.gpu_mode: + print("Pushing to GPU: {}".format(config.gpu_index)) + cfg.device = config.gpu_index + cfg.do_gpu = True + torch.cuda.set_device(cfg.device) + if config.multigpu: + model = models.multi_gpu( + model, config.gpu_indices).cuda() + else: + model.cuda(cfg.device) + print("Done.") + + print("Training") + + optimizer = OpenAIAdam(model.parameters(), + lr=opt.train.dynamic.lr, + schedule=opt.train.static.lrsched, + warmup=opt.train.static.lrwarm, + t_total=meta.iterations, + b1=opt.train.static.b1, + b2=opt.train.static.b2, + e=opt.train.static.e, + l2=opt.train.static.l2, + vector_l2=opt.train.static.vl2, + max_grad_norm=opt.train.static.clip) + + scorers = ["bleu", "rouge", "cider"] + trainer = train.make_trainer( + opt, meta, data_loader, model, optimizer) + trainer.set_evaluator(opt, model, data_loader) + + trainer.run() diff --git a/Model/COSMIC/feature_extraction/comet/src/main_conceptnet.py b/Model/COSMIC/feature_extraction/comet/src/main_conceptnet.py new file mode 100644 index 0000000000000000000000000000000000000000..baa252acaa90f719b9a631d0d51e0b4e3ecec0ed --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/main_conceptnet.py @@ -0,0 +1,138 @@ + +import random + +import torch + +import comet.src.train.conceptnet_train as train +import comet.src.models.models as models +import comet.src.data.data as data +import comet.utils.utils as utils +import comet.src.train.utils as train_utils +import comet.src.data.config as cfg + +from comet.src.data.utils import TextEncoder +from comet.src.train.opt import OpenAIAdam + + +def main(num): + # Generate configuration files depending on experiment being run + utils.generate_config_files("conceptnet", num) + + # Loads the correct configuration file + config_file = "config/conceptnet/config_{}.json".format(num) + + print(config_file) + + # Read config file to option + config = cfg.read_config(cfg.load_config(config_file)) + opt, meta = cfg.get_parameters(config) + + # config.gpu_mode = torch.cuda.is_available() + + # Set the random seeds + torch.manual_seed(opt.train.static.seed) + random.seed(opt.train.static.seed) + if config.gpu_mode: + torch.cuda.manual_seed_all(opt.train.static.seed) + + # Load the data + splits = ["train", "dev", "test"] + + opt.train.dynamic.epoch = 0 + + print("Loading Data") + + # Initialize path to pre-set data loader + path = "data/conceptnet/processed/{}/{}.pickle".format( + opt.exp, utils.make_name_string(opt.data)) + + # Make data loader + data_loader = data.make_data_loader(opt) + loaded = data_loader.load_data(path) + print(data_loader.sequences["train"]["total"].size(0)) + data_loader.opt = opt + data_loader.batch_size = opt.train.dynamic.bs + + print("Done.") + + text_encoder = TextEncoder(config.encoder_path, config.bpe_path) + + categories = data.conceptnet_data.conceptnet_relations + + special = [data.start_token, data.end_token] + special += ["<{}>".format(cat) for cat in categories] + + if loaded: + text_encoder.encoder = data_loader.vocab_encoder + text_encoder.decoder = data_loader.vocab_decoder + else: + for special_token in special: + text_encoder.decoder[len(encoder)] = special_token + text_encoder.encoder[special_token] = len(encoder) + data_loader.make_tensors(text_encoder, special) + + # Set max size of different parts of relation + context_size_e1 = 
data_loader.max_e1 + context_size_e2 = data_loader.max_e2 + context_size_r = data_loader.max_r + + opt.data.maxr = context_size_r + + n_special = len(special) + n_ctx = context_size_e1 + context_size_r + context_size_e2 + n_vocab = len(text_encoder.encoder) + n_ctx + + print(data_loader.__dict__.keys()) + opt.net.vSize = n_vocab + + # Build Model + print("Building Model") + + model = models.make_model( + opt, n_vocab, n_ctx, n_special, + load=(opt.net.init=="pt")) + + print("Done.") + + print("Files will be logged at: {}".format( + utils.make_name(opt, prefix="results/losses/", + is_dir=True, eval_=True))) + + data_loader.reset_offsets("train", keys=["total"]) + + data.set_max_sizes(data_loader) + + # Push to GPU + if config.gpu_mode: + print("Pushing to GPU: {}".format(config.gpu_index)) + cfg.device = config.gpu_index + cfg.do_gpu = True + torch.cuda.set_device(cfg.device) + if config.multigpu: + model = models.multi_gpu( + model, config.gpu_indices).cuda() + else: + model.cuda(cfg.device) + print("Done.") + + print("Training") + + optimizer = OpenAIAdam(model.parameters(), + lr=opt.train.dynamic.lr, + schedule=opt.train.static.lrsched, + warmup=opt.train.static.lrwarm, + t_total=meta.iterations, + b1=opt.train.static.b1, + b2=opt.train.static.b2, + e=opt.train.static.e, + l2=opt.train.static.l2, + vector_l2=opt.train.static.vl2, + max_grad_norm=opt.train.static.clip) + + trainer = train.make_trainer( + opt, meta, data_loader, model, optimizer) + print(data_loader.sequences["dev"]["total"].max()) + trainer.set_generator(opt, model, data_loader) + trainer.set_evaluator(opt, model, data_loader) + + trainer.run() diff --git a/Model/COSMIC/feature_extraction/comet/src/models/gpt.py b/Model/COSMIC/feature_extraction/comet/src/models/gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..22f5eda7f130f27c80aed1ff72bd1d62aacfbdee --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/models/gpt.py @@ -0,0 +1,311 @@ +import copy +import json +import math +import re + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.parameter import Parameter + + +''' +Much of this code is taken from HuggingFace's OpenAI LM Implementation here: + +https://github.com/huggingface/pytorch-openai-transformer-lm +''' + + +def gelu(x): + return (0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * + (x + 0.044715 * torch.pow(x, 3))))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT_FNS = { + 'relu': nn.ReLU, + 'swish': swish, + 'gelu': gelu +} + + +class LayerNorm(nn.Module): + "Construct a layernorm module in the OpenAI style \ + (epsilon inside the square root)." 
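+    # Concretely, the forward pass below computes, over the last dimension:
+    #   y = g * (x - mean(x)) / sqrt(var(x) + e) + b
+    # with the epsilon added inside the square root, as noted above.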
+ + def __init__(self, n_state, e=1e-5): + super(LayerNorm, self).__init__() + self.g = nn.Parameter(torch.ones(n_state)) + self.b = nn.Parameter(torch.zeros(n_state)) + self.e = e + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.e) + return self.g * x + self.b + + +class Conv1D(nn.Module): + def __init__(self, nf, rf, nx): + super(Conv1D, self).__init__() + self.rf = rf + self.nf = nf + if rf == 1: # faster 1x1 conv + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.w = Parameter(w) + self.b = Parameter(torch.zeros(nf)) + else: # was used to train LM + raise NotImplementedError + + def forward(self, x): + if self.rf == 1: + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w) + x = x.view(*size_out) + else: + raise NotImplementedError + return x + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, cfg, scale=False): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + + assert n_state % cfg.nH == 0 + self.register_buffer('b', torch.tril(torch.ones( + n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = cfg.nH + self.split_size = n_state + self.scale = scale + self.c_attn = Conv1D(n_state * 3, 1, nx) + self.c_proj = Conv1D(n_state, 1, nx) + self.attn_dropout = nn.Dropout(cfg.adpt) + self.resid_dropout = nn.Dropout(cfg.rdpt) + + # dimensions of w: (batch_size x num_heads x seq_length x seq_length) + def _attn(self, q, k, v, sequence_mask): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + + b_subset = self.b[:, :, :w.size(-2), :w.size(-1)] + + if sequence_mask is not None: + b_subset = b_subset * sequence_mask.view( + sequence_mask.size(0), 1, -1) + b_subset = b_subset.permute(1, 0, 2, 3) + + w = w * b_subset + -1e9 * (1 - b_subset) + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + return torch.matmul(w, v) + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) + else: + return x.permute(0, 2, 1, 3) + + def forward(self, x, sequence_mask): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + a = self._attn(query, key, value, sequence_mask) + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + return a + + +class MLP(nn.Module): + def __init__(self, n_state, cfg): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = cfg.hSize + self.c_fc = Conv1D(n_state, 1, nx) + self.c_proj = Conv1D(nx, 1, n_state) + self.act = ACT_FNS[cfg.afn] + self.dropout = nn.Dropout(cfg.rdpt) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, cfg, scale=False): + super(Block, self).__init__() + nx = cfg.hSize + self.attn = Attention(nx, n_ctx, cfg, scale) + self.ln_1 = LayerNorm(nx) + self.mlp = MLP(4 * nx, cfg) + self.ln_2 = LayerNorm(nx) + + def forward(self, x, sequence_mask): + a = self.attn(x, sequence_mask) + n = self.ln_1(x + a) + m = self.mlp(n) 
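+        # Post-layernorm residual ordering (original GPT): ln_2 normalizes
+        # the residual sum n + m, mirroring ln_1(x + a) above, rather than
+        # normalizing before the sublayer as in later pre-LN variants.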
+ h = self.ln_2(n + m) + return h + + +class TransformerModel(nn.Module): + """ Transformer model """ + + def __init__(self, cfg, vocab=40990, n_ctx=512): + super(TransformerModel, self).__init__() + self.vocab = vocab + self.embed = nn.Embedding(vocab, cfg.hSize) + self.drop = nn.Dropout(cfg.edpt) + block = Block(n_ctx, cfg, scale=True) + self.h = nn.ModuleList([copy.deepcopy(block) + for _ in range(cfg.nL)]) + + nn.init.normal_(self.embed.weight, std=0.02) + + def forward(self, x, sequence_mask): + x = x.view(-1, x.size(-2), x.size(-1)) + e = self.embed(x) + # Add the position information to the input embeddings + h = e.sum(dim=2) + for block in self.h: + h = block(h, sequence_mask) + return h + + +class LMModel(nn.Module): + """ Transformer with language model head only """ + def __init__(self, cfg, vocab=40990, n_ctx=512, + return_probs=False, return_acts=False): + super(LMModel, self).__init__() + self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx) + self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False) + self.return_probs = return_probs + self.return_acts = return_acts + if self.return_probs or self.return_acts: + pos_emb_mask = torch.zeros(1, 1, vocab) + pos_emb_mask[:, :, -n_ctx:] = -1e12 + self.register_buffer('pos_emb_mask', pos_emb_mask) + + def forward(self, x, sequence_mask=None): + h = self.transformer(x, sequence_mask) + lm_logits = self.lm_head(h) + if self.return_probs: + lm_logits = F.softmax(lm_logits + self.pos_emb_mask, dim=-1) + elif self.return_acts: + lm_logits = lm_logits + self.pos_emb_mask + return h, lm_logits + + +class LMHead(nn.Module): + """ Language Model Head for the transformer """ + + def __init__(self, model, cfg, trunc_and_reshape=True): + super(LMHead, self).__init__() + self.n_embd = cfg.hSize + embed_shape = model.embed.weight.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) + self.decoder.weight = model.embed.weight # Tied weights + self.trunc_and_reshape = trunc_and_reshape # XD + + def forward(self, h): + # Truncated Language modeling logits (we remove the last token) + h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) \ + if self.trunc_and_reshape else h # XD + lm_logits = self.decoder(h_trunc) + return lm_logits + + +def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, + n_embd=768, path='./model/', path_names='./'): + # Load weights from TF model + print("Loading weights...") + names = json.load(open(path_names + 'parameters_names.json')) + shapes = json.load(open(path + 'params_shapes.json')) + offsets = np.cumsum([np.prod(shape) for shape in shapes]) + init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)] + init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] + init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] + if n_ctx > 0: + init_params[0] = init_params[0][:n_ctx] + if n_special > 0: + init_params[0] = np.concatenate( + [init_params[1], + (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32), + init_params[0] + ], 0) + else: + init_params[0] = np.concatenate( + [init_params[1], + init_params[0] + ], 0) + del init_params[1] + if n_transfer == -1: + n_transfer = 0 + else: + n_transfer = 1 + n_transfer * 12 + init_params = [arr.squeeze() for arr in init_params] + + try: + assert model.embed.weight.shape == init_params[0].shape + except AssertionError as e: + e.args += (model.embed.weight.shape, init_params[0].shape) + raise + + model.embed.weight.data = 
torch.from_numpy(init_params[0]) + + for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]): + name = name[6:] # skip "model/" + assert name[-2:] == ":0" + name = name[:-2] + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + try: + assert pointer.shape == ip.shape + except AssertionError as e: + e.args += (pointer.shape, ip.shape) + raise + pointer.data = torch.from_numpy(ip) + + +class dotdict(dict): + """dot.notation access to dictionary attributes""" + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + +DEFAULT_CONFIG = dotdict({ + 'n_embd': 768, + 'n_head': 12, + 'n_layer': 12, + 'embd_pdrop': 0.1, + 'attn_pdrop': 0.1, + 'resid_pdrop': 0.1, + 'afn': 'gelu', + 'clf_pdrop': 0.1}) diff --git a/Model/COSMIC/feature_extraction/comet/src/models/models.py b/Model/COSMIC/feature_extraction/comet/src/models/models.py new file mode 100644 index 0000000000000000000000000000000000000000..89600130ef48664e1705b861930cfb4574c54a25 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/models/models.py @@ -0,0 +1,32 @@ +from comet.src.models.gpt import (LMModel, DEFAULT_CONFIG, load_openai_pretrained_model) +import torch.nn as nn + + +def make_model(opt, n_vocab, n_ctx, n_special, load=True, + return_acts=True, return_probs=False, + clf_token="", answer_size=None): + print(n_ctx) + if opt.exp == "generation": + model = LMModel( + opt.net, n_vocab, n_ctx, return_acts=return_acts, + return_probs=return_probs) + elif opt.exp == "classification": + model = ClfModel( + opt.net, n_vocab, n_ctx, clf_token, answer_size) + if load: + print("LOADING PRETRAINED TRANSFORMER") + load_openai_pretrained_model( + model.transformer, n_ctx=n_ctx, n_special=n_special) + return model + + +def multi_gpu(model, devices): + return nn.DataParallel(model, device_ids=devices) + + +def load_state_dict(model, state_dict): + try: + model.load_state_dict(state_dict) + except RuntimeError: + new_state_dict = {i[len("module."):]: j for i, j in state_dict.items()} + model.load_state_dict(new_state_dict) diff --git a/Model/COSMIC/feature_extraction/comet/src/models/utils.py b/Model/COSMIC/feature_extraction/comet/src/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d83ffe40d3fb814dcdfc379c645cf153087d7e96 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/models/utils.py @@ -0,0 +1,12 @@ +import torch + + +def prepare_position_embeddings(opt, encoder_vocab, sequences): + vocab_size = len(encoder_vocab) + num_positions = sequences.size(-2) + position_embeddings = torch.LongTensor( + range(vocab_size, vocab_size + num_positions)).to(sequences.device) + sequences = sequences.repeat(1, 1, 2) + sequences[:, :, 1] = position_embeddings + return sequences + diff --git a/Model/COSMIC/feature_extraction/comet/src/train/atomic_train.py b/Model/COSMIC/feature_extraction/comet/src/train/atomic_train.py new file mode 100644 index 0000000000000000000000000000000000000000..f1351774d42d2a1d55a06e4412f7c434c252c982 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/train/atomic_train.py @@ -0,0 +1,76 @@ +import random + +import comet.src.train.train as base_train +import comet.src.train.batch as batch +import comet.src.evaluate.atomic_evaluate as evaluate +# import comet.src.evaluate.atomic_generate as gen + + +def 
make_trainer(opt, *args): + return AtomicGenerationIteratorTrainer(opt, *args) + + +class AtomicGenerationIteratorTrainer(base_train.IteratorTrainer): + def __init__(self, opt, *args): + super(AtomicGenerationIteratorTrainer, self).__init__(opt, *args) + + self.initialize_losses(opt.data.get("categories", [])) + + def set_evaluator(self, opt, model, data_loader): + self.evaluator = evaluate.make_evaluator( + opt, model, data_loader) + + # def set_generator(self, opt, model, data_loader, scores, reward=None): + # self.generator = gen.make_generator( + # opt, model, data_loader, scores, reward) + + def set_sampler(self, opt): + if opt.train.static.samp not in self.samplers: + self.samplers[opt.train.static.samp] = sampling.make_sampler( + opt.train.static.samp, opt, self.data_loader, batch_mode=True) + self.batch_variables["sampler"] = self.samplers + + def batch(self, opt, *args): + outputs = batch.batch_atomic_generate(opt, *args) + + token_loss = outputs["loss"] + nums = outputs["nums"] + reset = outputs["reset"] + + return token_loss, nums, reset + + def initialize_losses(self, categories): + self.losses["train"] = { + "total_micro": [0], + "total_macro": [0] + } + + nums = {"total_micro": 0, "total_macro": 0} + + for category in categories: + micro_name = "{}_micro".format(category) + macro_name = "{}_macro".format(category) + + self.losses["train"][micro_name] = [0] + self.losses["train"][macro_name] = [0] + + nums[micro_name] = 0 + nums[macro_name] = 0 + + return nums + + def update_top_score(self, opt): + print(self.top_score) + if self.top_score is None: + self.top_score = (self.opt.train.dynamic.epoch, + self.get_tracked_score()) + elif self.get_tracked_score() < self.top_score[-1]: + self.top_score = (self.opt.train.dynamic.epoch, + self.get_tracked_score()) + print(self.top_score) + + def get_tracked_score(self): + return self.losses["dev"]["total_micro"][self.opt.train.dynamic.epoch] + + def counter(self, nums): + return nums["total_macro"] diff --git a/Model/COSMIC/feature_extraction/comet/src/train/batch.py b/Model/COSMIC/feature_extraction/comet/src/train/batch.py new file mode 100644 index 0000000000000000000000000000000000000000..4f90de737edc823148148c791a1dff383aa5428f --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/train/batch.py @@ -0,0 +1,135 @@ + +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comet.src.data.config as cfg +import comet.src.train.utils as train_utils +import comet.src.models.utils as model_utils +import comet.src.evaluate.utils as eval_utils +import comet.utils.utils as utils +from IPython import embed + + +############################################################################## +# BATCH +############################################################################## + + +def batch_atomic_generate(opt, nums, losses, batch_variables, eval_mode=False): + data_loader = batch_variables["data"] + model = batch_variables["model"] + split = batch_variables["split"] + + batch, reset = data_loader.sample_batch(split, bs=opt.train.dynamic.bs) + + input_ = model_utils.prepare_position_embeddings( + opt, data_loader.vocab_encoder, batch["sequences"].unsqueeze(-1)) + attention_mask = batch["attention_mask"] + loss_mask = batch["loss_mask"] + + targets = input_.squeeze(0)[:, 1:, 0].contiguous().view(-1) + + loss, dist = mle_steps( + opt.net.model, model, input_[:, :-1, :], targets, + attention_mask[:, :-1], loss_reduction="none") + + # Set loss name + micro_name = "total_micro" + macro_name = 
"total_macro" + + length = loss_mask.sum(1) + bs = input_.size(0) + + final_loss = (loss * loss_mask).sum(1) + + update_generation_losses(losses, nums, micro_name, macro_name, bs, + length, (loss * loss_mask).sum(1), split) + + final_loss = final_loss / length + + outputs = {"loss": final_loss.sum(), "nums": nums, "reset": reset} + + return outputs + + +def batch_conceptnet_generate(opt, nums, losses, batch_variables, + eval_mode=False, tracking_mode=False): + data_loader = batch_variables["data"] + model = batch_variables["model"] + split = batch_variables["split"] + category = batch_variables["category"] + + batch, reset = data_loader.sample_batch( + split, bs=opt.train.dynamic.bs, cat=category) + + input_ = model_utils.prepare_position_embeddings( + opt, data_loader.vocab_encoder, batch["sequences"].unsqueeze(-1)) + attention_mask = batch["attention_mask"] + loss_mask = batch["loss_mask"] + + targets = input_.squeeze(0)[:, 1:, 0].contiguous().view(-1) + + loss, dist = mle_steps( + opt.net.model, model, input_[:, :-1, :], targets, + attention_mask[:, :-1], loss_reduction="none") + + # Set loss name + if not eval_mode or batch_variables["category"] == "positive": + micro_name = "total_micro" + macro_name = "total_macro" + else: + micro_name = "negative_micro" + macro_name = "negative_macro" + + length = loss_mask.sum(1) + bs = input_.size(0) + + final_loss = (loss * loss_mask).sum(1) + + update_generation_losses(losses, nums, micro_name, macro_name, bs, + length, (loss * loss_mask).sum(1), split) + + final_loss = final_loss / length + + outputs = {"loss": final_loss.sum(), "nums": nums, "reset": reset} + + if tracking_mode: + outputs["tracking"] = final_loss.squeeze().tolist() + + return outputs + + +def mle_steps(key, model, input_, targets, attention_mask, + loss_reduction="mean", i=None): + word_acts = decode(model, input_.unsqueeze(1), + attention_mask, i) + + word_dist = train_utils.modify_output_for_loss_fn( + "nll", word_acts, dim=-1) + + # Compute losses + loss = F.nll_loss( + word_dist.view(-1, word_dist.size(-1)), + targets, reduction=loss_reduction) + + if loss_reduction != "mean": + return loss.view(word_dist.size(0), -1), word_dist + else: + return loss, word_dist + + +def decode(model, input_, attention_mask, i=None): + return model(input_, sequence_mask=attention_mask) + + +def update_generation_losses(losses, nums, micro, macro, bs, + length, loss, split): + if split == "train": + train_utils.update_generation_losses( + losses, nums, micro, macro, bs, length, loss) + else: + eval_utils.update_generation_losses( + losses, nums, micro, macro, bs, length, loss) diff --git a/Model/COSMIC/feature_extraction/comet/src/train/conceptnet_train.py b/Model/COSMIC/feature_extraction/comet/src/train/conceptnet_train.py new file mode 100644 index 0000000000000000000000000000000000000000..422dff8059572142e9b797b40c32b6f9e2b06414 --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/train/conceptnet_train.py @@ -0,0 +1,67 @@ +import random +import torch + +import comet.src.data.config as cfg + +import comet.src.train.atomic_train as base_train +import comet.src.train.batch as batch_utils +import comet.src.evaluate.conceptnet_evaluate as evaluate +import comet.src.evaluate.conceptnet_generate as gen + + +def make_trainer(opt, *args): + return ConceptNetGenerationIteratorTrainer(opt, *args) + + +class ConceptNetGenerationIteratorTrainer( + base_train.AtomicGenerationIteratorTrainer): + def set_evaluator(self, opt, model, data_loader): + self.evaluator = evaluate.make_evaluator( + 
opt, model, data_loader)
+
+    def set_generator(self, opt, model, data_loader):
+        self.generator = gen.make_generator(
+            opt, model, data_loader)
+
+    def batch(self, opt, *args):
+        outputs = batch_utils.batch_atomic_generate(opt, *args)
+
+        token_loss = outputs["loss"]
+        nums = outputs["nums"]
+        reset = outputs["reset"]
+
+        return token_loss, nums, reset
+
+    def update_top_score(self, opt):
+        print(self.top_score)
+
+        tracked_scores = self.get_tracked_score()
+
+        if self.top_score is None:
+            self.top_score = {"epoch": {}, "score": {}}
+            self.top_score["epoch"]["total_micro"] = self.opt.train.dynamic.epoch
+            self.top_score["score"]["total_micro"] = tracked_scores["total_micro"]
+        else:
+            if tracked_scores["total_micro"] < self.top_score["score"]["total_micro"]:
+                self.top_score["epoch"]["total_micro"] = self.opt.train.dynamic.epoch
+                self.top_score["score"]["total_micro"] = tracked_scores["total_micro"]
+
+        print(self.top_score)
+
+    def get_tracked_score(self):
+        return {
+            "total_micro": self.losses["dev"]["total_micro"][self.opt.train.dynamic.epoch]
+        }
+
+    def decide_to_save(self):
+        to_save = cfg.save and not cfg.toy
+
+        curr_epoch = self.opt.train.dynamic.epoch
+
+        to_save = to_save or cfg.test_save
+        print(cfg.save_strategy)
+        if cfg.save_strategy == "best":
+            if self.top_score["epoch"]["total_micro"] != curr_epoch:
+                to_save = False
+        return to_save
diff --git a/Model/COSMIC/feature_extraction/comet/src/train/opt.py b/Model/COSMIC/feature_extraction/comet/src/train/opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef8b9666a41a52f775dfa38797648aa0553af2e0
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/comet/src/train/opt.py
@@ -0,0 +1,122 @@
+'''TAKEN from OpenAI LM Code by HuggingFace'''
+
+import math
+import torch
+from torch.optim import Optimizer
+from torch.nn.utils import clip_grad_norm_
+
+
+def warmup_cosine(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    # x is a plain float here, so use math.cos rather than torch.cos
+    return s*(x/warmup) + (1-s)*(0.5 * (1 + math.cos(math.pi * x)))
+
+
+def warmup_constant(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    return s*(x/warmup) + (1-s)*1
+
+
+def warmup_linear(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    return (s*(x/warmup) + (1-s))*(1-x)
+
+
+SCHEDULES = {
+    'warmup_cosine': warmup_cosine,
+    'warmup_constant': warmup_constant,
+    'warmup_linear': warmup_linear,
+}
+
+
+class OpenAIAdam(Optimizer):
+    """Implements the OpenAI version of the Adam algorithm with the
+    weight decay fix.
+    """
+    def __init__(self, params, lr, schedule, warmup, t_total,
+                 b1=0.9, b2=0.999, e=1e-8, l2=0,
+                 vector_l2=False, max_grad_norm=-1, **kwargs):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if schedule not in SCHEDULES:
+            raise ValueError("Invalid schedule parameter: {}".format(schedule))
+        if not 0 <= warmup:
+            raise ValueError("Invalid warmup: {}".format(warmup))
+        if not 0.0 <= b1 < 1.0:
+            raise ValueError("Invalid b1 parameter: {}".format(b1))
+        if not 0.0 <= b2 < 1.0:
+            raise ValueError("Invalid b2 parameter: {}".format(b2))
+        if not 0.0 <= e:
+            raise ValueError("Invalid epsilon value: {}".format(e))
+        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                        max_grad_norm=max_grad_norm)
+        super(OpenAIAdam, self).__init__(params, defaults)
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + # print(group['t_total']) + # print(group['warmup']) + # if self.state[group['params'][0]]: + # print(self.state[group['params'][0]]['step'] / group['t_total']) + # print() + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'Adam does not support sparse gradients, \ + please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['b1'], group['b2'] + + state['step'] += 1 + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['e']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + schedule_fct = SCHEDULES[group['schedule']] + lr_scheduled = (group['lr'] * schedule_fct(state['step'] / + group['t_total'], group['warmup'])) + step_size = (lr_scheduled * math.sqrt(bias_correction2) / + bias_correction1) + + p.data.addcdiv_(-step_size, exp_avg, denom) + + # Add weight decay at the end (fixed version) + if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0: + p.data.add_(-lr_scheduled * group['l2'], p.data) + + return loss diff --git a/Model/COSMIC/feature_extraction/comet/src/train/train.py b/Model/COSMIC/feature_extraction/comet/src/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..7b050bd0d39ab0d72adfffbda72bb73f2eae247a --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/train/train.py @@ -0,0 +1,233 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comet.src.data.config as cfg +import comet.src.data.data as data +import comet.src.train.utils as train_utils +import comet.src.train.batch as batch + +import comet.src.evaluate.evaluate as evaluate +import comet.src.evaluate.generate as gen +import comet.src.evaluate.sampler as sampling + +import comet.utils.utils as utils + +from tensorboardX import SummaryWriter + + +class Trainer(object): + def __init__(self, opt, meta, data_loader, model, optimizer): + self.optimizer = optimizer + + self.model = model + + if opt.trainer == "epoch": + self.epochs = meta.epochs + self.data_loader = data_loader + self.opt = opt + + self.losses = {"dev": {}, "test": {}, "train": {}} + + self.top_score = None + + self.lrs = {} + + self.batch_variables = { + "data": self.data_loader, + "model": self.model, + "split": "train" + } + + self.do_gen = cfg.do_gen + self.samplers = {} + + def decide_to_save(self): + to_save = cfg.save and not cfg.toy + + to_save = to_save or cfg.test_save + print(cfg.save_strategy) + if cfg.save_strategy == "best": + if self.top_score[0] != self.opt.train.dynamic.epoch: + print("DOING IT RIGHT") + to_save = False + return to_save + + def save_model(self, tracked_score): + lrs = {} + for i, param_group in enumerate(self.optimizer.param_groups): + lrs[i] = param_group['lr'] + self.lrs[self.opt.train.dynamic.epoch] = lrs + + 
to_save = self.decide_to_save() + + if to_save: + data.save_step( + self.model, self.data_loader.vocab_encoder, + self.optimizer, self.opt, + self.opt.train.dynamic.epoch, self.lrs) + + def log_losses(self, opt, losses): + if (not cfg.toy and cfg.save) or cfg.test_save: + data.save_eval_file(opt, losses["train"], "losses", split="train") + data.save_eval_file(opt, losses['dev'], "losses", split="dev") + data.save_eval_file(opt, losses['test'], "losses", split="test") + + def set_logger(self): + if cfg.toy: + self.logger = SummaryWriter(utils.make_name( + self.opt, prefix="garbage/logs/", eval_=True, do_epoch=False)) + else: + self.logger = SummaryWriter(utils.make_name( + self.opt, prefix="logs/", eval_=True, do_epoch=False)) + print("Logging Tensorboard Files at: {}".format(self.logger.logdir)) + + def stop_logger(self): + self.logger.close() + + def run(self): + self.set_logger() + self.count = 0 + for epoch in range(self.epochs): + self.model.train() + self.opt.train.dynamic.epoch += 1 + self.epoch() + + self.stop_logger() + + def epoch(self): + nums = self.reset_losses() + + # Initialize progress bar + bar = utils.initialize_progress_bar( + self.data_loader.sequences["train"]) + + reset = False + + while not reset: + loss, nums, reset = self.do_forward_pass(nums) + self.do_backward_pass(loss) + self.update_parameters() + + bar.update(self.opt.train.dynamic.bs) + self.count += 1 + + for loss_name in self.losses["train"]: + self.logger.add_scalar( + "train/{}".format(loss_name), + loss.item() / self.opt.train.dynamic.bs, + self.count) + + if cfg.toy and self.counter(nums) > 300: + break + + with torch.no_grad(): + self.run_evaluation_cycle() + + self.log_losses(self.opt, self.losses) + self.update_top_score(self.opt) + self.save_model(self.get_tracked_score()) + + self.data_loader.reset_offsets("train") + + def run_evaluation_cycle(self): + for split in ["dev", "test"]: + self.evaluator.validate( + self.opt.train.dynamic.epoch, split, + self.losses[split]) + + if self.do_gen: + gen.do_gen_run( + self.opt, self.generator, self.opt.train.dynamic.epoch, + split, self.losses[split]) + iter_num = self.opt.train.dynamic.epoch + + for loss_name in self.losses[split]: + self.logger.add_scalar( + "{}/{}".format(split, loss_name), + self.losses[split][loss_name][iter_num], + iter_num) + + def clip_gradients(self): + if self.opt.train.static.clip: + torch.nn.utils.clip_grad_norm_( + self.model.parameters(), self.opt.train.static.clip) + + def do_forward_pass(self, nums): + token_loss, nums, reset = self.batch( + self.opt, nums, self.losses["train"], + self.batch_variables) + return token_loss, nums, reset + + def do_backward_pass(self, loss): + loss.backward() + + def update_parameters(self): + if self.opt.model == "lstm": + self.clip_gradients() + self.optimizer.step() + self.optimizer.zero_grad() + + def reset_losses(self): + loss_names = set([i.rstrip("maicro").rstrip("_") for + i in self.losses["train"].keys()]) + return self.initialize_losses(list(loss_names)) + + +class IteratorTrainer(Trainer): + def __init__(self, opt, meta, data_loader, model, optimizer): + super(IteratorTrainer, self).__init__( + opt, meta, data_loader, model, optimizer) + + self.iters = meta.cycle + self.total_iters = meta.iterations + + def run(self): + self.set_logger() + + # Initialize progress bar + bar = utils.set_progress_bar(self.total_iters) + + for cycle_num in range(int(self.total_iters / self.iters)): + self.model.train() + + self.cycle(bar, cycle_num) + + with torch.no_grad(): + self.run_evaluation_cycle() + 
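+            # end of cycle: persist the loss curves, refresh the tracked best
+            # score, and let decide_to_save() pick whether to checkpoint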
+ self.log_losses(self.opt, self.losses) + self.update_top_score(self.opt) + self.save_model(self.get_tracked_score()) + + self.stop_logger() + + def cycle(self, bar, cycle_num): + nums = self.reset_losses() + print(self.losses["train"]) + + for i in range(1, self.iters + 1): + # self.model.zero_grad() + + loss, nums, reset = self.do_forward_pass(nums) + self.do_backward_pass(loss) + + self.update_parameters() + # print(loss) + # print(loss.item()) + self.opt.train.dynamic.epoch += 1 + + for loss_name in self.losses["train"]: + self.logger.add_scalar( + "train/{}".format(loss_name), + loss.item() / self.opt.train.dynamic.bs, + self.opt.train.dynamic.epoch) + + bar.update(1) + + if cfg.toy and i > 10: + break + + if reset: + self.data_loader.reset_offsets("train") + diff --git a/Model/COSMIC/feature_extraction/comet/src/train/utils.py b/Model/COSMIC/feature_extraction/comet/src/train/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1fc4780c4391f8dcd0d78a8b633a6ae51fec1a1f --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/src/train/utils.py @@ -0,0 +1,58 @@ +import torch +import torch.optim +import torch.nn.functional as F + +import copy + + +def update_generation_losses(losses, nums, micro, macro, bs, length, loss): + # Update Losses + losses[micro] += \ + [copy.deepcopy(losses[micro][-1])] + losses[macro] += \ + [copy.deepcopy(losses[macro][-1])] + + losses[micro][-1] *= nums[micro] + losses[macro][-1] *= nums[macro] + + nums[macro] += bs + + if isinstance(length, int): + update_indiv_generation_losses( + losses, nums, micro, macro, bs, length, loss) + else: + update_tensor_generation_losses( + losses, nums, micro, macro, bs, length, loss) + + +def update_indiv_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += (bs * length) + + batch_loss = loss * bs + + losses[micro][-1] += batch_loss + losses[micro][-1] /= nums[micro] + losses[macro][-1] += batch_loss / length + losses[macro][-1] /= nums[macro] + + +def update_tensor_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += length.sum().item() + + losses[micro][-1] += loss.sum().item() + losses[micro][-1] /= nums[micro] + losses[macro][-1] += (loss / length.float()).sum().item() + losses[macro][-1] /= nums[macro] + + +def modify_output_for_loss_fn(loss_fn, output, dim): + if loss_fn == "ce": + return output + if loss_fn == "mse": + return F.softmax(output, dim=dim) + if loss_fn == "nll": + return F.log_softmax(output, dim=dim) + if loss_fn in ["bce", "wbce", "wbce1"]: + return torch.sigmoid(output) diff --git a/Model/COSMIC/feature_extraction/comet/utils/__init__.py b/Model/COSMIC/feature_extraction/comet/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Model/COSMIC/feature_extraction/comet/utils/utils.py b/Model/COSMIC/feature_extraction/comet/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..459c80a583446f45f8a4caf91dadc3cc39e2ba8b --- /dev/null +++ b/Model/COSMIC/feature_extraction/comet/utils/utils.py @@ -0,0 +1,210 @@ +import json +import copy + +import torch + +import numpy as np +import contextlib + +from distutils.dir_util import mkpath + +from tqdm import tqdm + + +def make_new_tensor_from_list(items, device_num, dtype=torch.float32): + if device_num is not None: + device = torch.device("cuda:{}".format(device_num)) + else: + device = torch.device("cpu") + return torch.tensor(items, dtype=dtype, device=device) 
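+
+# Usage sketch for make_new_tensor_from_list (illustrative values only, not
+# part of the original file):
+#   make_new_tensor_from_list([1, 2, 3], device_num=None)
+#   -> tensor([1., 2., 3.])                   # float32 on the CPU
+#   make_new_tensor_from_list([1, 2, 3], device_num=0)
+#   -> tensor([1., 2., 3.], device='cuda:0')  # requires CUDA device 0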
+ + +# is_dir look ast at whether the name we make +# should be a directory or a filename +def make_name(opt, prefix="", eval_=False, is_dir=True, set_epoch=None, + do_epoch=True): + string = prefix + string += "{}-{}".format(opt.dataset, opt.exp) + string += "/" + string += "{}-{}-{}".format(opt.trainer, opt.cycle, opt.iters) + string += "/" + string += opt.model + if opt.mle: + string += "-{}".format(opt.mle) + string += "/" + string += make_name_string(opt.data) + "/" + + string += make_name_string(opt.net) + "/" + string += make_name_string(opt.train.static) + "/" + + if eval_: + string += make_name_string(opt.eval) + "/" + # mkpath caches whether a directory has been created + # In IPython, this can be a problem if the kernel is + # not reset after a dir is deleted. Trying to recreate + # that dir will be a problem because mkpath will think + # the directory already exists + if not is_dir: + mkpath(string) + string += make_name_string( + opt.train.dynamic, True, do_epoch, set_epoch) + if is_dir: + mkpath(string) + + return string + + +def make_name_string(dict_, final=False, do_epoch=False, set_epoch=None): + if final: + if not do_epoch: + string = "{}_{}_{}".format( + dict_.lr, dict_.optim, dict_.bs) + elif set_epoch is not None: + string = "{}_{}_{}_{}".format( + dict_.lr, dict_.optim, dict_.bs, set_epoch) + else: + string = "{}_{}_{}_{}".format( + dict_.lr, dict_.optim, dict_.bs, dict_.epoch) + + return string + + string = "" + + for k, v in dict_.items(): + if type(v) == DD: + continue + if isinstance(v, list): + val = "#".join(is_bool(str(vv)) for vv in v) + else: + val = is_bool(v) + if string: + string += "-" + string += "{}_{}".format(k, val) + + return string + + +def is_bool(v): + if str(v) == "False": + return "F" + elif str(v) == "True": + return "T" + return v + + +def generate_config_files(type_, key, name="base", eval_mode=False): + with open("config/default.json".format(type_), "r") as f: + base_config = json.load(f) + with open("config/{}/default.json".format(type_), "r") as f: + base_config_2 = json.load(f) + if eval_mode: + with open("config/{}/eval_changes.json".format(type_), "r") as f: + changes_by_machine = json.load(f) + else: + with open("config/{}/changes.json".format(type_), "r") as f: + changes_by_machine = json.load(f) + + base_config.update(base_config_2) + + if name in changes_by_machine: + changes = changes_by_machine[name] + else: + changes = changes_by_machine["base"] + + # for param in changes[key]: + # base_config[param] = changes[key][param] + + replace_params(base_config, changes[key]) + + mkpath("config/{}".format(type_)) + + with open("config/{}/config_{}.json".format(type_, key), "w") as f: + json.dump(base_config, f, indent=4) + + +def replace_params(base_config, changes): + for param, value in changes.items(): + if isinstance(value, dict) and param in base_config: + replace_params(base_config[param], changes[param]) + else: + base_config[param] = value + + +def initialize_progress_bar(data_loader_list): + num_examples = sum([len(tensor) for tensor in + data_loader_list.values()]) + return set_progress_bar(num_examples) + + +def set_progress_bar(num_examples): + bar = tqdm(total=num_examples) + bar.update(0) + return bar + + +def merge_list_of_dicts(L): + result = {} + for d in L: + result.update(d) + return result + + +def return_iterator_by_type(data_type): + if isinstance(data_type, dict): + iterator = data_type.items() + else: + iterator = enumerate(data_type) + return iterator + + +@contextlib.contextmanager +def temp_seed(seed): + state 
= np.random.get_state() + np.random.seed(seed) + try: + yield + finally: + np.random.set_state(state) + + +def flatten(outer): + return [el for inner in outer for el in inner] + + +def zipped_flatten(outer): + return [(key, fill, el) for key, fill, inner in outer for el in inner] + + +def remove_none(l): + return [e for e in l if e is not None] + + +# Taken from Jobman 0.1 +class DD(dict): + def __getattr__(self, attr): + if attr == '__getstate__': + return super(DD, self).__getstate__ + elif attr == '__setstate__': + return super(DD, self).__setstate__ + elif attr == '__slots__': + return super(DD, self).__slots__ + return self[attr] + + def __setattr__(self, attr, value): + # Safety check to ensure consistent behavior with __getattr__. + assert attr not in ('__getstate__', '__setstate__', '__slots__') +# if attr.startswith('__'): +# return super(DD, self).__setattr__(attr, value) + self[attr] = value + + def __str__(self): + return 'DD%s' % dict(self) + + def __repr__(self): + return str(self) + + def __deepcopy__(self, memo): + z = DD() + for k, kv in self.items(): + z[k] = copy.deepcopy(kv, memo) + return z diff --git a/Model/COSMIC/feature_extraction/multiprocessing_bpe_encoder.py b/Model/COSMIC/feature_extraction/multiprocessing_bpe_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f0240c210f73b2731f6b6258d103a2131c812633 --- /dev/null +++ b/Model/COSMIC/feature_extraction/multiprocessing_bpe_encoder.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import contextlib +import sys + +from collections import Counter +from multiprocessing import Pool + +from fairseq.data.encoders.gpt2_bpe import get_encoder + + +def main(): + """ + Helper script to encode raw text with the GPT-2 BPE using multiple processes. 
+ + The encoder.json and vocab.bpe files can be obtained here: + - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json + - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--encoder-json", + help='path to encoder.json', + ) + parser.add_argument( + "--vocab-bpe", + type=str, + help='path to vocab.bpe', + ) + parser.add_argument( + "--inputs", + nargs="+", + default=['-'], + help="input files to filter/encode", + ) + parser.add_argument( + "--outputs", + nargs="+", + default=['-'], + help="path to save encoded outputs", + ) + parser.add_argument( + "--keep-empty", + action="store_true", + help="keep empty lines", + ) + parser.add_argument("--workers", type=int, default=20) + args = parser.parse_args() + + assert len(args.inputs) == len(args.outputs), \ + "number of input and output paths should match" + + with contextlib.ExitStack() as stack: + inputs = [ + stack.enter_context(open(input, "r", encoding="utf-8")) + if input != "-" else sys.stdin + for input in args.inputs + ] + outputs = [ + stack.enter_context(open(output, "w", encoding="utf-8")) + if output != "-" else sys.stdout + for output in args.outputs + ] + + encoder = MultiprocessingEncoder(args) + pool = Pool(args.workers, initializer=encoder.initializer) + encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100) + + stats = Counter() + for i, (filt, enc_lines) in enumerate(encoded_lines, start=1): + if filt == "PASS": + for enc_line, output_h in zip(enc_lines, outputs): + print(enc_line, file=output_h) + else: + stats["num_filtered_" + filt] += 1 + if i % 10000 == 0: + print("processed {} lines".format(i), file=sys.stderr) + + for k, v in stats.most_common(): + print("[{}] filtered {} lines".format(k, v), file=sys.stderr) + + +class MultiprocessingEncoder(object): + + def __init__(self, args): + self.args = args + + def initializer(self): + global bpe + bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe) + + def encode(self, line): + global bpe + ids = bpe.encode(line) + return list(map(str, ids)) + + def decode(self, tokens): + global bpe + return bpe.decode(tokens) + + def encode_lines(self, lines): + """ + Encode a set of lines. All lines will be encoded together. 
+ """ + enc_lines = [] + for line in lines: + line = line.strip() + if len(line) == 0 and not self.args.keep_empty: + return ["EMPTY", None] + tokens = self.encode(line) + enc_lines.append(" ".join(tokens)) + return ["PASS", enc_lines] + + def decode_lines(self, lines): + dec_lines = [] + for line in lines: + tokens = map(int, line.strip().split()) + dec_lines.append(self.decode(tokens)) + return ["PASS", dec_lines] + + +if __name__ == "__main__": + main() diff --git a/Model/COSMIC/feature_extraction/src/__init__.py b/Model/COSMIC/feature_extraction/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Model/COSMIC/feature_extraction/src/data/atomic.py b/Model/COSMIC/feature_extraction/src/data/atomic.py new file mode 100644 index 0000000000000000000000000000000000000000..85020a6809898915afbf57a3417f3daf9a9b22bd --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/data/atomic.py @@ -0,0 +1,337 @@ +import comet.utils.utils as utils +import comet.src.data.utils as data_utils +import comet.src.data.config as cfg + +import pandas +import json +import random +import math +import torch + +from tqdm import tqdm + + +def map_name(name): + if name == "train": + return "trn" + elif name == "test": + return "tst" + else: + return "dev" + + +class DataLoader(object): + def __init__(self, opt): + self.data = {} + self.data["train"] = {} + self.data["dev"] = {} + self.data["test"] = {} + + self.sequences = {} + self.sequences["train"] = {} + self.sequences["dev"] = {} + self.sequences["test"] = {} + + self.masks = {} + self.masks["train"] = {} + self.masks["dev"] = {} + self.masks["test"] = {} + + self.offsets = {} + self.offsets["train"] = {} + self.offsets["dev"] = {} + self.offsets["test"] = {} + + def offset_summary(self, split): + return self.offsets[split]["total"] + + +def do_take_partial_dataset(data_opts): + if data_opts.get("kr", None) is None: + return False + if data_opts.kr == 1: + return False + return True + + +def select_partial_dataset(data_opts, data): + num_selections = math.ceil(data_opts.kr * len(data)) + return random.sample(data, num_selections) + + +class GenerationDataLoader(DataLoader): + def __init__(self, opt, categories): + super(GenerationDataLoader, self).__init__(opt) + + self.categories = categories + self.opt = opt + + for split in self.data: + self.data[split] = {"total": []} + self.offsets[split] = {"total": 0} + + self.vocab_encoder = None + self.vocab_decoder = None + self.special_chars = None + self.max_event = None + self.max_effect = None + + def load_data(self, path): + if ".pickle" in path: + print("Loading data from: {}".format(path)) + data_utils.load_existing_data_loader(self, path) + + return True + + for split in self.data: + file_name = "v4_atomic_{}.csv".format(map_name(split)) + + df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0) + df.iloc[:, :9] = df.iloc[:, :9].apply( + lambda col: col.apply(json.loads)) + + for cat in self.categories: + attr = df[cat] + self.data[split]["total"] += utils.zipped_flatten(zip( + attr.index, ["<{}>".format(cat)] * len(attr), attr.values)) + + if do_take_partial_dataset(self.opt.data): + self.data["train"]["total"] = select_partial_dataset( + self.opt.data, self.data["train"]["total"]) + + return False + + def make_tensors(self, text_encoder, special, + splits=["train", "dev", "test"], test=False): + self.vocab_encoder = text_encoder.encoder + self.vocab_decoder = text_encoder.decoder + self.special_chars = special + + 
sequences = {} + for split in splits: + sequences[split] = get_generation_sequences( + self.opt, self.data, split, text_encoder, test) + + self.masks[split]["total"] = [(len(i[0]), len(i[1])) for + i in sequences[split]] + + self.max_event = max([max([l[0] for l in self.masks[split]["total"]]) + for split in self.masks]) + self.max_effect = max([max([l[1] for l in self.masks[split]["total"]]) + for split in self.masks]) + + print(self.max_event) + print(self.max_effect) + + for split in splits: + num_elements = len(sequences[split]) + self.sequences[split]["total"] = torch.LongTensor( + num_elements, self.max_event + self.max_effect).fill_(0) + + for i, seq in enumerate(sequences[split]): + # print(self.sequences[split]["total"][i, :len(seq[0])].size()) + # print(torch.FloatTensor(seq[0]).size()) + self.sequences[split]["total"][i, :len(seq[0])] = \ + torch.LongTensor(seq[0]) + self.sequences[split]["total"][i, self.max_event:self.max_event + len(seq[1])] = \ + torch.LongTensor(seq[1]) + + def sample_batch(self, split, bs, idxs=None): + offset = self.offsets[split]["total"] + + batch = {} + + # Decided not to reduce computation on here because it's all parallel + # anyway and we don't want to run out of memory in cases where we + # don't see the longest version quickly enough + + if idxs: + seqs = self.sequences[split]["total"].index_select( + 0, torch.LongTensor(idxs).to( + self.sequences[split]["total"].device)) + else: + seqs = self.sequences[split]["total"][offset:offset + bs] + batch["sequences"] = seqs.to(cfg.device) + batch["attention_mask"] = make_attention_mask(seqs) + batch["loss_mask"] = make_loss_mask( + seqs, self.max_event, 1) + batch["key"] = ("total", offset, offset + bs) + + offset += seqs.size(0) + + self.offsets[split]["total"] = offset + + if split == "train" and offset + bs > len(self.sequences[split]["total"]): + return batch, True + elif offset >= len(self.sequences[split]["total"]): + return batch, True + else: + return batch, False + + def reset_offsets(self, splits=["train", "test", "dev"], + shuffle=True, keys=None): + if isinstance(splits, str): + splits = [splits] + + for split in splits: + if keys is None: + keys = ["total"] + + for key in keys: + self.offsets[split][key] = 0 + + if shuffle: + self.shuffle_sequences(split, keys) + + def shuffle_sequences(self, split="train", keys=None): + if keys is None: + # print(type(self.data)) + # print(type(self.data.keys())) + keys = self.data[split].keys() + + for key in keys: + idxs = list(range(len(self.data[split][key]))) + + random.shuffle(idxs) + + self.sequences[split][key] = \ + self.sequences[split][key].index_select( + 0, torch.LongTensor(idxs)) + + temp = [self.data[split][key][i] for i in idxs] + self.data[split][key] = temp + temp = [self.masks[split][key][i] for i in idxs] + self.masks[split][key] = temp + + +def prune_data_for_evaluation(data_loader, categories, split): + indices = [] + for i, example in enumerate(data_loader.data[split]["total"]): + if example[1] in categories: + indices.append(i) + + data_loader.masks[split]["total"] = [data_loader.masks[split]["total"][i] + for i in indices] + data_loader.sequences[split]["total"] = \ + data_loader.sequences[split]["total"].index_select( + 0, torch.LongTensor(indices)) + data_loader.data[split]["total"] = [data_loader.data[split]["total"][i] + for i in indices] + + +def make_attention_mask(sequences): + return (sequences != 0).float().to(cfg.device) + + +def make_loss_mask(sequences, max_event, num_delim_tokens): + # print(num_delim_tokens) + # 
print(sequences.size())
+    mask = (sequences != 0).float()
+    mask[:, :max_event + num_delim_tokens] = 0
+    return mask[:, 1:].to(cfg.device)
+
+
+def find_underscore_length(seq):
+    start = "_"
+
+    while start in seq:
+        start += "_"
+    return start[:-1]
+
+
+def handle_underscores(suffix, text_encoder, prefix=False):
+    encoder = text_encoder.encoder
+    if prefix:
+        tok = "___"
+    else:
+        tok = find_underscore_length(suffix)
+
+    suffix_parts = [i.strip() for i in suffix.split("{}".format(tok))]
+    to_flatten = []
+    for i, part in enumerate(suffix_parts):
+        if part:
+            to_flatten.append(text_encoder.encode([part], verbose=False)[0])
+
+            if i != len(suffix_parts) - 1 and suffix_parts[i+1]:
+                to_flatten.append([encoder["<blank>"]])
+        else:
+            to_flatten.append([encoder["<blank>"]])
+
+    final_suffix = utils.flatten(to_flatten)
+
+    return final_suffix
+
+
+def get_generation_sequences(opt, data, split, text_encoder, test):
+    sequences = []
+    count = 0
+
+    final_prefix = None
+    final_suffix = None
+
+    for prefix, category, suffix in tqdm(data[split]["total"]):
+        final_prefix, final_suffix = do_example(
+            text_encoder, prefix, suffix, True, True)
+
+        final = compile_final_sequence(
+            opt, final_prefix, final_suffix, category, text_encoder)
+
+        sequences.append(final)
+
+        count += 1
+
+        if count > 10 and test:
+            break
+
+    return sequences
+
+
+def do_example(text_encoder, prefix, suffix, do_prefix, do_suffix):
+    final_prefix = None
+    final_suffix = None
+
+    if do_prefix:
+        if "___" in prefix:
+            final_prefix = handle_underscores(prefix, text_encoder, True)
+        else:
+            final_prefix = text_encoder.encode([prefix], verbose=False)[0]
+    if do_suffix:
+        if "_" in suffix:
+            final_suffix = handle_underscores(suffix, text_encoder)
+        else:
+            final_suffix = text_encoder.encode([suffix], verbose=False)[0]
+
+    return final_prefix, final_suffix
+
+
+def compile_final_sequence(opt, final_prefix, final_suffix, category, text_encoder):
+    final = []
+
+    final.append(final_prefix)
+    final.append(
+        [text_encoder.encoder[category]] +
+        final_suffix)
+
+    # every target sequence is terminated with the special <END> token
+    final[-1].append(text_encoder.encoder["<END>"])
+
+    return final
+
+
+num_delimiter_tokens = {
+    "category": 1,
+    "hierarchy": 3,
+    "hierarchy+label": 4,
+    "category+hierarchy": 4,
+    "category+hierarchy+label": 5
+}
diff --git a/Model/COSMIC/feature_extraction/src/data/conceptnet.py b/Model/COSMIC/feature_extraction/src/data/conceptnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..872df788249667bf721a91d3460891a4047a718f
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/data/conceptnet.py
@@ -0,0 +1,342 @@
+import comet.src.data.utils as data_utils
+import comet.src.data.atomic as adata
+import comet.src.data.config as cfg
+
+import torch
+import random
+from tqdm import tqdm
+
+
+def map_name(name, opt):
+    if name == "train":
+        return "train{}k.txt".format(opt.trainsize)
+    elif name == "test":
+        return "test.txt"
+    else:
+        return "dev{}.txt".format(opt.devversion)
+
+
+conceptnet_relations = [
+    'AtLocation', 'CapableOf', 'Causes', 'CausesDesire',
+    'CreatedBy', 'DefinedAs', 'DesireOf', 'Desires', 'HasA',
+    'HasFirstSubevent', 'HasLastSubevent', 'HasPainCharacter',
+    'HasPainIntensity', 'HasPrerequisite',
'HasProperty', + 'HasSubevent', 'InheritsFrom', 'InstanceOf', 'IsA', + 'LocatedNear', 'LocationOfAction', 'MadeOf', 'MotivatedByGoal', + 'NotCapableOf', 'NotDesires', 'NotHasA', 'NotHasProperty', + 'NotIsA', 'NotMadeOf', 'PartOf', 'ReceivesAction', 'RelatedTo', + 'SymbolOf', 'UsedFor' +] + + +split_into_words = { + 'AtLocation': "at location", + 'CapableOf': "capable of", + 'Causes': "causes", + 'CausesDesire': "causes desire", + 'CreatedBy': "created by", + 'DefinedAs': "defined as", + 'DesireOf': "desire of", + 'Desires': "desires", + 'HasA': "has a", + 'HasFirstSubevent': "has first subevent", + 'HasLastSubevent': "has last subevent", + 'HasPainCharacter': "has pain character", + 'HasPainIntensity': "has pain intensity", + 'HasPrerequisite': "has prequisite", + 'HasProperty': "has property", + 'HasSubevent': "has subevent", + 'InheritsFrom': "inherits from", + 'InstanceOf': 'instance of', + 'IsA': "is a", + 'LocatedNear': "located near", + 'LocationOfAction': "location of action", + 'MadeOf': "made of", + 'MotivatedByGoal': "motivated by goal", + 'NotCapableOf': "not capable of", + 'NotDesires': "not desires", + 'NotHasA': "not has a", + 'NotHasProperty': "not has property", + 'NotIsA': "not is a", + 'NotMadeOf': "not made of", + 'PartOf': "part of", + 'ReceivesAction': "receives action", + 'RelatedTo': "related to", + 'SymbolOf': "symbol of", + 'UsedFor': "used for" +} + + +class GenerationDataLoader(adata.DataLoader): + def __init__(self, opt, categories=None): + super(GenerationDataLoader, self).__init__(opt) + self.opt = opt + + for split in self.data: + self.data[split] = {"total": []} + self.offsets[split] = {"total": 0} + + self.vocab_encoder = None + self.vocab_decoder = None + self.special_chars = None + self.max_e1 = None + self.max_e2 = None + self.max_r = None + + def offset_summary(self, split): + return sum(self.offsets[split].values()) + + def load_data(self, path): + if ".pickle" in path: + print("Loading data from: {}".format(path)) + data_utils.load_existing_data_loader(self, path) + return True + + for split in self.data: + file_name = map_name(split, self.opt.data) + + if split != "dev" or self.opt.data.devversion != "12": + string_tuples = open("{}/{}".format( + path, file_name), "r").read().split("\n") + tuples = [x.split("\t") for x in string_tuples if x] + else: + string_tuples = open("{}/{}".format( + path, "dev1.txt"), "r").read().split("\n") + tuples = [x.split("\t") for x in string_tuples if x] + string_tuples = open("{}/{}".format( + path, "dev2.txt"), "r").read().split("\n") + tuples += [x.split("\t") for x in string_tuples if x] + + if split in ["dev", "test"]: + if self.opt.data.rel == "language": + self.data[split]["total"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), int(i[3])) for i in tuples] + self.data[split]["positive"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), int(i[3])) for i in tuples if int(i[3])] + self.data[split]["negative"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), int(i[3])) for i in tuples if not int(i[3])] + elif self.opt.data.rel == "relation": + self.data[split]["total"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), int(i[3])) for i in tuples] + self.data[split]["positive"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), int(i[3])) for i in tuples if int(i[3])] + self.data[split]["negative"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), int(i[3])) 
for i in tuples if not int(i[3])] + else: + if self.opt.data.rel == "language": + self.data[split]["total"] = \ + [(i[1].lower().strip(), split_into_words[i[0]], + i[2].lower().strip(), i[3]) for i in tuples] + elif self.opt.data.rel == "relation": + self.data[split]["total"] = \ + [(i[1].lower().strip(), "<{}>".format(i[0]), + i[2].lower().strip(), i[3]) for i in tuples] + + return False + + def make_tensors(self, text_encoder, special, + splits=["train", "dev", "test"], test=False): + self.vocab_encoder = text_encoder.encoder + self.vocab_decoder = text_encoder.decoder + self.special_chars = special + + sequences = {} + for split in splits: + sequences[split], discarded = get_generation_sequences( + self.data, split, text_encoder, test, self.opt.data.maxe1, + self.opt.data.maxe2) + + if split == "train": + self.data[split]["total"] = [j for i, j in enumerate( + self.data[split]["total"]) if i not in set(discarded)] + self.masks[split]["total"] = [(len(i[0]), len(i[1]), len(i[2])) for + i in sequences[split]] + + self.max_e1 = max([max([l[0] for l in self.masks[split]["total"]]) + for split in self.masks]) + self.max_r = max([max([l[1] for l in self.masks[split]["total"]]) + for split in self.masks]) + self.max_e2 = max([max([l[2] for l in self.masks[split]["total"]]) + for split in self.masks]) + + print(self.max_e1) + print(self.max_r) + print(self.max_e2) + + for split in splits: + num_elements = len(sequences[split]) + self.sequences[split]["total"] = torch.LongTensor( + num_elements, self.max_e1 + self.max_e2 + self.max_r).fill_(0) + + for i, seq in enumerate(sequences[split]): + # print(self.sequences[split]["total"][i, :len(seq[0])].size()) + # print(torch.FloatTensor(seq[0]).size()) + self.sequences[split]["total"][i, :len(seq[0])] = \ + torch.LongTensor(seq[0]) + start_r = self.max_e1 + end_r = self.max_e1 + len(seq[1]) + self.sequences[split]["total"][i, start_r:end_r] = \ + torch.LongTensor(seq[1]) + start_e2 = self.max_e1 + self.max_r + end_e2 = self.max_e1 + self.max_r + len(seq[2]) + self.sequences[split]["total"][i, start_e2:end_e2] = \ + torch.LongTensor(seq[2]) + + if split in ["test", "dev"]: + print(split) + self.sequences[split]["negative"] = \ + self.sequences[split]["total"].index_select( + 0, torch.LongTensor([i for i, j in enumerate( + self.data[split]['total']) if not j[3]])) + # self.data[split]['total'][:self.sequences[split]["total"].size(0)]) if not j[3]])) + self.sequences[split]["positive"] = \ + self.sequences[split]["total"].index_select( + 0, torch.LongTensor([i for i, j in enumerate( + self.data[split]['total']) if j[3]])) + # self.data[split]['total'][:self.sequences[split]["total"].size(0)]) if j[3]])) + + def sample_batch(self, split, bs, cat="total", idxs=None): + offset = self.offsets[split][cat] + + batch = {} + + # Decided not to reduce computation on here because it's all parallel + # anyway and we don't want to run out of memory in cases where we + # don't see the longest version quickly enough + + if idxs: + seqs = self.sequences[split][cat].index_select( + 0, torch.LongTensor(idxs).to( + self.sequences[split][cat].device)) + else: + seqs = self.sequences[split][cat][offset:offset + bs] + batch["sequences"] = seqs.to(cfg.device) + batch["attention_mask"] = make_attention_mask(seqs) + batch["loss_mask"] = make_loss_mask(seqs, self.max_e1 + self.max_r) + batch["key"] = (cat, offset, offset + bs) + + offset += seqs.size(0) + + self.offsets[split][cat] = offset + + if split == "train" and offset + bs > len(self.sequences[split][cat]): + return 
batch, True
+        elif offset >= len(self.sequences[split][cat]):
+            return batch, True
+        else:
+            return batch, False
+
+    def reset_offsets(self, splits=["train", "test", "dev"],
+                      shuffle=True, keys=None):
+        if isinstance(splits, str):
+            splits = [splits]
+
+        for split in splits:
+            if keys is None:
+                keys = ["total", "positive", "negative"]
+
+            for key in keys:
+                self.offsets[split][key] = 0
+
+            if shuffle:
+                self.shuffle_sequences(split, keys)
+
+    def shuffle_sequences(self, split="train", keys=None):
+        if keys is None:
+            keys = self.data[split].keys()
+
+        for key in keys:
+            if key in ["positive", "negative"]:
+                continue
+            idxs = list(range(len(self.data[split][key])))
+
+            random.shuffle(idxs)
+
+            self.sequences[split][key] = \
+                self.sequences[split][key].index_select(
+                    0, torch.LongTensor(idxs))
+
+            temp = [self.data[split][key][i] for i in idxs]
+            self.data[split][key] = temp
+
+            temp = [self.masks[split][key][i] for i in idxs]
+            self.masks[split][key] = temp
+
+
+def make_attention_mask(sequences):
+    return (sequences != 0).float().to(cfg.device)
+
+
+def make_loss_mask(sequences, max_event):
+    mask = (sequences != 0).float()
+    mask[:, :max_event] = 0
+    return mask[:, 1:].to(cfg.device)
+
+
+def get_generation_sequences(data, split, text_encoder, test,
+                             max_e1=10, max_e2=15):
+    sequences = []
+    count = 0
+
+    final_event1 = None
+    final_event2 = None
+    final_relation = None
+
+    discarded = []
+
+    for event1, relation, event2, _ in tqdm(data[split]["total"]):
+        e1, r, e2 = do_example(text_encoder, event1, relation, event2)
+
+        if (split == "train" and len(e1) > max_e1 or
+                len(e2) > max_e2):
+            discarded.append(count)
+            count += 1
+            continue
+
+        final = compile_final_sequence(
+            e1, e2, r, text_encoder)
+
+        sequences.append(final)
+
+        count += 1
+
+        if count > 10 and test:
+            break
+
+    return sequences, discarded
+
+
+def do_example(text_encoder, event1, relation, event2):
+    final_event1 = text_encoder.encode([event1], verbose=False)[0]
+    # relations rendered as special tokens (e.g. "<AtLocation>") contain
+    # uppercase characters and are looked up directly in the vocabulary;
+    # natural-language relations are BPE-encoded instead
+    if relation.lower() != relation:
+        final_relation = [text_encoder.encoder[relation]]
+    else:
+        final_relation = text_encoder.encode(
+            [relation], verbose=False)[0]
+    if event2 is not None:
+        final_event2 = text_encoder.encode([event2], verbose=False)[0]
+    else:
+        final_event2 = None
+
+    return final_event1, final_relation, final_event2
+
+
+# note: called as compile_final_sequence(e1, e2, r, text_encoder), so the
+# emitted order below is event1, relation, event2
+def compile_final_sequence(final_event1, final_event2, final_relation, text_encoder):
+    final = []
+
+    final.append(final_event1)
+    final.append(final_relation)
+    final.append(final_event2)
+
+    # terminate the target phrase with the special <END> token
+    final[-1].append(text_encoder.encoder["<END>"])
+
+    return final
diff --git a/Model/COSMIC/feature_extraction/src/data/data.py b/Model/COSMIC/feature_extraction/src/data/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f908dc3fa7801b794f2680e0066f04b2e1244b
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/data/data.py
@@ -0,0 +1,85 @@
+import os
+import comet.src.data.atomic as atomic_data
+import comet.src.data.conceptnet as conceptnet_data
+import comet.src.data.config as cfg
+
+import comet.utils.utils as utils
+
+import pickle
+import torch
+import json
+
+
+# special vocabulary tokens used to delimit encoded sequences
+start_token = "<START>"
+end_token = "<END>"
+blank_token = "<blank>"
+
+
+def save_checkpoint(state, filename):
+    print("Saving model to {}".format(filename))
+    torch.save(state, filename)
+
+
+def save_step(model, vocab, optimizer, opt, length, lrs):
+    if cfg.test_save:
+        name = "{}.pickle".format(utils.make_name(
+            opt, prefix="garbage/models/", is_dir=False,
eval_=True))
+    else:
+        name = "{}.pickle".format(utils.make_name(
+            opt, prefix="models/", is_dir=False, eval_=True))
+    save_checkpoint({
+        "epoch": length, "state_dict": model.state_dict(),
+        "optimizer": optimizer.state_dict(), "opt": opt,
+        "vocab": vocab, "epoch_learning_rates": lrs},
+        name)
+
+
+def save_eval_file(opt, stats, eval_type="losses", split="dev", ext="pickle"):
+    if cfg.test_save:
+        name = "{}/{}.{}".format(utils.make_name(
+            opt, prefix="garbage/{}/".format(eval_type),
+            is_dir=True, eval_=True), split, ext)
+    else:
+        name = "{}/{}.{}".format(utils.make_name(
+            opt, prefix="results/{}/".format(eval_type),
+            is_dir=True, eval_=True), split, ext)
+    print("Saving {} {} to {}".format(split, eval_type, name))
+
+    if ext == "pickle":
+        with open(name, "wb") as f:
+            pickle.dump(stats, f)
+    elif ext == "txt":
+        with open(name, "w") as f:
+            f.write(stats)
+    elif ext == "json":
+        with open(name, "w") as f:
+            json.dump(stats, f)
+    else:
+        # A bare `raise` with no active exception fails with a RuntimeError;
+        # name the failure explicitly for unsupported extensions.
+        raise ValueError("Unsupported extension: {}".format(ext))
+
+
+def load_checkpoint(filename, gpu=True):
+    if os.path.exists(filename):
+        checkpoint = torch.load(
+            filename, map_location=lambda storage, loc: storage)
+    else:
+        # Printing a warning and then returning an unbound name was a latent
+        # NameError; fail loudly when the checkpoint is missing.
+        raise FileNotFoundError("No model found at {}".format(filename))
+    return checkpoint
+
+
+def make_data_loader(opt, *args):
+    if opt.dataset == "atomic":
+        return atomic_data.GenerationDataLoader(opt, *args)
+    elif opt.dataset == "conceptnet":
+        return conceptnet_data.GenerationDataLoader(opt, *args)
+
+
+def set_max_sizes(data_loader, force_split=None):
+    data_loader.total_size = {}
+    if force_split is not None:
+        data_loader.total_size[force_split] = \
+            data_loader.sequences[force_split]["total"].size(0)
+        return
+    for split in data_loader.sequences:
+        data_loader.total_size[split] = \
+            data_loader.sequences[split]["total"].size(0)
diff --git a/Model/COSMIC/feature_extraction/src/data/utils.py b/Model/COSMIC/feature_extraction/src/data/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a77549e48434f494be0b99576087de5a3d724be
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/data/utils.py
@@ -0,0 +1,134 @@
+import re
+import ftfy
+import json
+import spacy
+import torch
+
+from tqdm import tqdm
+
+
+def load_existing_data_loader(data_loader, path):
+    old_data_loader = torch.load(path)
+    for attr in data_loader.__dict__.keys():
+        if attr not in old_data_loader.__dict__.keys():
+            continue
+        setattr(data_loader, attr, getattr(old_data_loader, attr))
+
+
+################################################################################
+#
+# Code Below taken from HuggingFace pytorch-openai-lm repository
+#
+################################################################################
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+
+    word is represented as tuple of symbols
+    (symbols being variable-length strings)
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def text_standardize(text):
+    """
+    fixes some issues the spacy tokenizer had on books corpus
+    also does some whitespace standardization
+    """
+    text = text.replace('—', '-')
+    text = text.replace('–', '-')
+    text = text.replace('―', '-')
+    text = text.replace('…', '...')
+    text = text.replace('´', "'")
+    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
+    text = re.sub(r'\s*\n\s*', ' \n ', text)
+    text = re.sub(r'[^\S\n]+', ' ', text)
+    return text.strip()
+
+
+class TextEncoder(object):
+    """
+    mostly a wrapper for a public python bpe tokenizer
+    """
+
+    def __init__(self, encoder_path, bpe_path):
+        self.nlp = spacy.load(
+            'en', disable=['parser', 'tagger', 'ner', 'textcat'])
+        self.encoder = json.load(open(encoder_path))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        merges = open(bpe_path, encoding='utf-8').read().split('\n')[1:-1]
+        merges = [tuple(merge.split()) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+    def bpe(self, token):
+        # The '</w>' end-of-word markers are restored here; they appear to
+        # have been stripped from this diff by tag sanitization (cf. the
+        # HuggingFace pytorch-openai-transformer-lm source).
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(
+                pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    # `first` does not occur again; keep the tail as-is
+                    new_word.extend(word[i:])
+                    break
+
+                if (word[i] == first and i < len(word) - 1 and
+                        word[i+1] == second):
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        if word == '\n  </w>':
+            word = '\n</w>'
+        self.cache[token] = word
+        return word
+
+    def encode(self, texts, verbose=True):
+        texts_tokens = []
+        if verbose:
+            for text in tqdm(texts, ncols=80, leave=False):
+                text = self.nlp(text_standardize(ftfy.fix_text(text)))
+                text_tokens = []
+                for token in text:
+                    text_tokens.extend(
+                        [self.encoder.get(t, 0) for t in
+                         self.bpe(token.text.lower()).split(' ')])
+                texts_tokens.append(text_tokens)
+        else:
+            for text in texts:
+                text = self.nlp(text_standardize(ftfy.fix_text(text)))
+                text_tokens = []
+                for token in text:
+                    text_tokens.extend(
+                        [self.encoder.get(t, 0) for t in
+                         self.bpe(token.text.lower()).split(' ')])
+                texts_tokens.append(text_tokens)
+        return texts_tokens
diff --git a/Model/COSMIC/feature_extraction/src/evaluate/atomic_evaluate.py b/Model/COSMIC/feature_extraction/src/evaluate/atomic_evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..e22139b092b0326be36c95f8ed07a5eb15254b70
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/evaluate/atomic_evaluate.py
@@ -0,0 +1,40 @@
+import comet.src.train.batch as batch
+import comet.src.evaluate.evaluate as base_evaluate
+import numpy as np
+
+def make_evaluator(opt, *args):
+    if opt.exp == "generation":
+        return AtomicGenerationEvaluator(opt, *args)
+    else:
+        return AtomicClassificationEvaluator(opt, *args)
+
+
+class AtomicGenerationEvaluator(base_evaluate.Evaluator):
+    def __init__(self, opt, model, data_loader):
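+        # This evaluator accumulates token-level ("micro") and sequence-level
+        # ("macro") negative log-likelihood sums; compute_final_scores below
+        # divides them by their counts and exponentiates the averages to
+        # report perplexity.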
super(AtomicGenerationEvaluator, self).__init__( + opt, model, data_loader) + + self.batch = batch.batch_atomic_generate + + def initialize_losses(self): + average_loss = {"total_micro": 0, "total_macro": 0} + nums = {"total_micro": 0, "total_macro": 0} + return average_loss, nums + + def compute_final_scores(self, average_loss, nums): + average_loss["total_macro"] /= nums["total_macro"] + average_loss["total_micro"] /= nums["total_micro"] + + average_loss["ppl_macro"] = np.exp(average_loss["total_macro"]) + average_loss["ppl_micro"] = np.exp(average_loss["total_micro"]) + + return average_loss + + def counter(self, nums): + return nums["total_macro"] + + def print_result(self, split, epoch_losses): + print("{} Loss: \t {}".format( + split, epoch_losses["total_micro"])) + print("{} Perplexity: \t {}".format( + split, epoch_losses["ppl_micro"])) diff --git a/Model/COSMIC/feature_extraction/src/evaluate/conceptnet_evaluate.py b/Model/COSMIC/feature_extraction/src/evaluate/conceptnet_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..197506aeacef6c1b77f7f9a51a36ca3f95cd85e1 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/evaluate/conceptnet_evaluate.py @@ -0,0 +1,82 @@ +import time +import numpy as np + +import comet.src.train.batch as batch_utils +import comet.utils.utils as utils +import comet.src.evaluate.evaluate as base_evaluate + + +def make_evaluator(opt, *args, **kwargs): + return ConceptNetGenerationEvaluator(opt, *args, **kwargs) + + +class ConceptNetGenerationEvaluator(base_evaluate.Evaluator): + def __init__(self, opt, model, data_loader, track=False): + super(ConceptNetGenerationEvaluator, self).__init__( + opt, model, data_loader) + + if track: + self.tracker = {"positive": [], "negative": []} + else: + self.tracker = None + + def batch(self, opt, nums, average_loss, batch_variables, eval_mode): + batch_variables["category"] = self.current_category + + outputs = batch_utils.batch_conceptnet_generate( + opt, nums, average_loss, batch_variables, eval_mode, + tracking_mode=self.tracker is not None) + + if outputs.get("tracking", None) is not None: + self.tracker[self.current_category] += outputs["tracking"] + + if outputs["reset"] and batch_variables["category"] == "positive": + outputs["reset"] = False + self.current_category = "negative" + + return outputs + + def initialize_losses(self): + average_loss = {"total_micro": 0, "total_macro": 0, + "negative_micro": 0, "negative_macro": 0} + nums = {"total_micro": 0, "total_macro": 0, + "negative_micro": 0, "negative_macro": 0} + + self.current_category = "positive" + + if self.tracker is not None: + self.tracker = {"positive": [], "negative": []} + + return average_loss, nums + + def compute_final_scores(self, average_loss, nums): + average_loss["total_macro"] /= nums["total_macro"] + average_loss["total_micro"] /= nums["total_micro"] + + if nums["negative_micro"]: + average_loss["negative_macro"] /= nums["negative_macro"] + average_loss["negative_micro"] /= nums["negative_micro"] + else: + average_loss["negative_macro"] = 0 + average_loss["negative_micro"] = 0 + + average_loss["macro_diff"] = (average_loss["negative_macro"] - + average_loss["total_macro"]) + average_loss["micro_diff"] = (average_loss["negative_micro"] - + average_loss["total_micro"]) + + average_loss["ppl_macro"] = np.exp(average_loss["total_macro"]) + average_loss["ppl_micro"] = np.exp(average_loss["total_micro"]) + + return average_loss + + def counter(self, nums): + return nums["total_macro"] + + def print_result(self, split, 
epoch_losses): + print("{} Loss: \t {}".format( + split, epoch_losses["total_micro"])) + print("{} Diff: \t {}".format( + split, epoch_losses["micro_diff"])) + print("{} Perplexity: \t {}".format( + split, epoch_losses["ppl_micro"])) diff --git a/Model/COSMIC/feature_extraction/src/evaluate/conceptnet_generate.py b/Model/COSMIC/feature_extraction/src/evaluate/conceptnet_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4302416cb7548e790ad5abd8e9339c5333629b --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/evaluate/conceptnet_generate.py @@ -0,0 +1,112 @@ +import time +import torch + +import comet.src.evaluate.generate as base_generate +import comet.src.evaluate.sampler as sampling +import comet.utils.utils as utils +import comet.src.data.config as cfg + + +def make_generator(opt, *args): + return ConceptNetGenerator(opt, *args) + + +class ConceptNetGenerator(base_generate.Generator): + def __init__(self, opt, model, data_loader): + self.opt = opt + + self.model = model + self.data_loader = data_loader + + self.sampler = sampling.make_sampler( + opt.eval.sample, opt, data_loader) + + def reset_sequences(self): + return [] + + def generate(self, split="dev"): + print("Generating Sequences") + + # Set evaluation mode + self.model.eval() + + # Reset evaluation set for dataset split + self.data_loader.reset_offsets(splits=split, shuffle=False) + + start = time.time() + count = 0 + sequences = None + + # Reset generated sequence buffer + sequences = self.reset_sequences() + + # Initialize progress bar + bar = utils.set_progress_bar( + self.data_loader.total_size[split] / 2) + + reset = False + + with torch.no_grad(): + # Cycle through development set + while not reset: + + start = len(sequences) + # Generate a single batch + reset = self.generate_batch(sequences, split, bs=1) + + end = len(sequences) + + if not reset: + bar.update(end - start) + else: + print(end) + + count += 1 + + if cfg.toy and count > 10: + break + if (self.opt.eval.gs != "full" and (count > opt.eval.gs)): + break + + torch.cuda.synchronize() + print("{} generations completed in: {} s".format( + split, time.time() - start)) + + # Compute scores for sequences (e.g., BLEU, ROUGE) + # Computes scores that the generator is initialized with + # Change define_scorers to add more scorers as possibilities + # avg_scores, indiv_scores = self.compute_sequence_scores( + # sequences, split) + avg_scores, indiv_scores = None, None + + return sequences, avg_scores, indiv_scores + + def generate_batch(self, sequences, split, verbose=False, bs=1): + # Sample batch from data loader + batch, reset = self.data_loader.sample_batch( + split, bs=bs, cat="positive") + + start_idx = self.data_loader.max_e1 + self.data_loader.max_r + max_end_len = self.data_loader.max_e2 + + context = batch["sequences"][:, :start_idx] + reference = batch["sequences"][:, start_idx:] + init = "".join([self.data_loader.vocab_decoder[i].replace( + '', ' ') for i in context[:, :self.data_loader.max_e1].squeeze().tolist() if i]).strip() + + start = self.data_loader.max_e1 + end = self.data_loader.max_e1 + self.data_loader.max_r + + attr = "".join([self.data_loader.vocab_decoder[i].replace( + '', ' ') for i in context[:, start:end].squeeze(0).tolist() if i]).strip() + + # Decode sequence + sampling_result = self.sampler.generate_sequence( + batch, self.model, self.data_loader, start_idx, max_end_len) + + sampling_result["key"] = batch["key"] + sampling_result["e1"] = init + sampling_result["r"] = attr + 
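+        # The bare `.replace('', ' ')` calls above appear to be a
+        # sanitization artifact: upstream COMET joins decoded BPE tokens with
+        # `.replace('</w>', ' ')`, turning end-of-word subword markers back
+        # into spaces.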
sequences.append(sampling_result) + + return reset diff --git a/Model/COSMIC/feature_extraction/src/evaluate/evaluate.py b/Model/COSMIC/feature_extraction/src/evaluate/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..9a761a820266a410716b42f171ed61d93f0e3ce5 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/evaluate/evaluate.py @@ -0,0 +1,85 @@ +import time +import torch + +import comet.utils.utils as utils +import comet.src.data.config as cfg + + +class Evaluator(object): + def __init__(self, opt, model, data_loader): + super(Evaluator, self).__init__() + + self.data_loader = data_loader + self.model = model + + self.batch_variables = { + "model": model, + "data": data_loader + } + + self.opt = opt + + def validate(self, l, split="dev", losses={}, keyset=None): + self.batch_variables["split"] = split + print("Evaluating {}".format(split)) + + epoch_losses = self.epoch( + self.opt, self.model, self.data_loader, split, keyset) + + self.print_result(split, epoch_losses) + + for loss_name, loss_val in epoch_losses.items(): + losses.setdefault(loss_name, {}) + losses[loss_name][l] = loss_val + + def epoch(self, opt, model, data_loader, split, keyset=None): + average_loss, nums = self.initialize_losses() + + data_loader.reset_offsets(splits=split, shuffle=False) + + # Set evaluation mode + model.eval() + + start = time.time() + + # Initialize progress bar + bar = utils.set_progress_bar( + data_loader.total_size[split]) + + reset = False + + with torch.no_grad(): + while not reset: + + start = data_loader.offset_summary(split) + + outputs = self.batch( + opt, nums, average_loss, + self.batch_variables, eval_mode=True) + + end = data_loader.offset_summary(split) + + reset = outputs["reset"] + + if not reset: + bar.update(end - start) + else: + print(end) + + if cfg.toy and self.counter(nums) > 100: + break + if (opt.eval.es != "full" and + (self.counter(nums) > opt.eval.es)): + break + + nums = outputs["nums"] + + torch.cuda.synchronize() + + print("{} evaluation completed in: {} s".format( + split.capitalize(), time.time() - start)) + + average_loss = self.compute_final_scores( + average_loss, nums) + + return average_loss diff --git a/Model/COSMIC/feature_extraction/src/evaluate/generate.py b/Model/COSMIC/feature_extraction/src/evaluate/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..cc1d2830b03ad58eeb3f82d6d3bd2f9246e359fd --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/evaluate/generate.py @@ -0,0 +1,72 @@ +import comet.src.data.data as data +import comet.src.data.config as cfg +import comet.src.evaluate.sampler as sampling + + +def do_gen_run(opt, generator, l, split="dev", scores={}): + # Generate sequences for examples in evaluation set using + # current trained model + + if opt.eval.gs == "full": + sequences, avg_scores, indiv_scores = generator.generate(split) + else: + sequences, avg_scores, indiv_scores = generator.generate_some(split) + + if avg_scores is not None: + # Record scores from generated sequences + for score_name, score_val in avg_scores.items(): + scores.setdefault(score_name, {}) + scores[score_name].setdefault(l, []) + scores[score_name][l] += [score_val] + + # Save generated sequences + save_sequences(opt, sequences, avg_scores, indiv_scores, + l, split, opt.eval.gs == "full", + generator.data_loader) + + +def save_sequences(opt, sequences, avg_scores, indiv_scores, + l, split, full, data_loader): + # This seems a bit roundabout since l = opt.train.dynamic in train.py + # But it's in case we 
start checkpointing outside of epoch boundaries + opt.train.dynamic.epoch = l + + if cfg.save: + if full: + names = {"gens": "gens", "scores": "scores", + "indiv": "indiv.scores"} + else: + names = {"gens": "gens.small", "scores": "scores.small", + "indiv": "indiv.scores.small"} + # Save generated sequences + data.save_eval_file(opt, sequences, names["gens"], split) + + if avg_scores is not None: + # Save average scores over evaluation set for generated sequences + # Scores computed are the ones the generator was initialized with + data.save_eval_file(opt, avg_scores, names["scores"], split) + + if split == "dev": + # Save individual scores + data.save_eval_file( + opt, indiv_scores, names["indiv"], split) + + +class Generator(object): + def __init__(self, opt, model, data_loader, scorers, reward_function=None): + super(Generator, self).__init__() + self.opt = opt + + self.model = model + self.data_loader = data_loader + + self.sampler = sampling.make_sampler( + opt.eval.sample, opt, data_loader) + + + def generate(self, split="dev"): + pass + + def generate_batch(self, sequences, split, verbose=False, bs=32): + pass + diff --git a/Model/COSMIC/feature_extraction/src/evaluate/sampler.py b/Model/COSMIC/feature_extraction/src/evaluate/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..518a641549ea1e1baabc200634531641dc473002 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/evaluate/sampler.py @@ -0,0 +1,329 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comet.src.data.data as data +import comet.src.data.config as cfg +import comet.src.models.utils as model_utils +import comet.src.evaluate.utils as eval_utils +import comet.src.train.batch as batch_utils + +def make_sampler(sampler_type, opt, *args, **kwargs): + print("Initializing Greedy Sampler") + return GreedySampler(opt, *args, **kwargs) + +class Sampler(): + def __init__(self, opt, data_loader, batch_mode=False): + # Token on which to end sampling + self.end_token = data_loader.vocab_encoder[data.end_token] + + self.opt = opt + + def generate_sequence(self, batch, model): + raise + + +class GreedySampler(Sampler): + def __init__(self, opt, data_loader, batch_mode=True): + super(GreedySampler, self).__init__(opt, data_loader) + + def append_batch(self, X, next_idx, mask): + next_pos = X[:, -1:, 1] + 1 + next_x = torch.cat((next_idx, next_pos), -1).unsqueeze(1) + next_mask = torch.cat([mask, torch.ones(X.size(0), 1, device=mask.device)], 1) + return torch.cat((X, next_x), 1), next_mask + + def generate_sequence(self, batch, model, data_loader, start_idx, end_len): + XMB = batch["sequences"][:, :start_idx] + MMB = batch["attention_mask"][:, :start_idx] + + XMB = model_utils.prepare_position_embeddings( + self.opt, data_loader.vocab_encoder, XMB.unsqueeze(-1)) + + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + values, indices = lm_probs[:, -1, :].max(dim=-1) + seqs = indices.clone().unsqueeze(1) + + loss = values + counts = 1 + next_pos = XMB[:, -1:, 1] + 1 + next_x = torch.cat((indices.view(-1, 1), next_pos), -1).unsqueeze(1) + XMB = torch.cat((XMB, next_x), 1) + MMB = torch.cat([MMB, torch.ones(XMB.size(0), 1, device=MMB.device)], 1) + + # Sample from top k + + for _ in range(self.opt.eval.smax): + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + # Sample from top k + values, next_idx = lm_probs[:, -1, :].max(dim=-1) + + loss += values + counts += 1 + + next_idx = 
next_idx.unsqueeze(1) + + seqs = torch.cat([seqs, next_idx], 1) + + if (next_idx.item() == self.end_token) or (_ == end_len - 1): + break + + XMB, MMB = self.append_batch(XMB, next_idx, MMB) + + beams = [] + + for beam in seqs: + beams.append(" ".join("".join( + [data_loader.vocab_decoder[tok.item()].replace( + '', ' ').replace('\n', '') + for tok in beam if tok != self.end_token]).split())) + + sampling_result = { + "sequence": beams[0], + "beams": beams, + "beam_losses": [loss.item()], + "loss": loss.item(), + "beam_lengths": [counts], + "length": counts + } + + return sampling_result + + +class TopKSampler(Sampler): + def __init__(self, opt, data_loader, batch_mode=True): + super(TopKSampler, self).__init__(opt, data_loader) + + def append_batch(self, X, next_idx, mask): + next_pos = X[:, -1:, 1] + 1 + next_x = torch.cat((next_idx, next_pos), -1).unsqueeze(1) + next_mask = torch.cat([mask, torch.ones(X.size(0), 1, device=mask.device)], 1) + return torch.cat((X, next_x), 1), next_mask + + def generate_sequence(self, batch, model, data_loader, start_idx, end_len): + # start_idx = context_size_event + 1 + # start_idx = max_e1 + max_r + # end_idx = context_size_effect - 1 + # end_idx = max_e2 + XMB = batch["sequences"][:, :start_idx] + MMB = batch["attention_mask"][:, :start_idx] + + XMB = model_utils.prepare_position_embeddings( + self.opt, data_loader.vocab_encoder, XMB.unsqueeze(-1)) + + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + values, indices = lm_probs[:, -1, :].topk(self.opt.eval.k) + seqs = indices.t().clone() + + losses = - values.view(-1, 1) + + ended = (seqs == self.end_token).float() + counts = (1 - ended) + XMB = XMB.repeat(self.opt.eval.k, 1, 1) + MMB = MMB.repeat(self.opt.eval.k, 1) + next_pos = XMB[:, -1:, 1] + 1 + next_x = torch.cat((indices.view(self.opt.eval.k, -1), next_pos), -1).unsqueeze(1) + XMB = torch.cat((XMB, next_x), 1) + MMB = torch.cat([MMB, torch.ones(XMB.size(0), 1, device=MMB.device)], 1) + + # Sample from top k + + for _ in range(end_len): + _, lp = model(XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + + # Sample from top k + values, indices = lm_probs[:, -1, :].topk(self.opt.eval.k) + choice = torch.multinomial(values.exp(), 1) + next_idx = indices.gather(-1, choice) + + ended = ended + (next_idx == self.end_token).float() * (1 - ended) + + next_idx = next_idx * (1 - ended).long() + ended.long() * self.end_token + + counts += (1 - ended) + + seqs = torch.cat([seqs, next_idx], 1) + + if ended.sum().item() == self.opt.eval.k: + break + + losses -= values.gather(-1, choice) * (1 - ended) + + XMB, MMB = self.append_batch(XMB, next_idx, MMB) + + beams = [] + + for beam in seqs: + beams.append(" ".join("".join( + [data_loader.vocab_decoder[tok.item()].replace( + '', ' ').replace('\n', '') + for tok in beam if tok != self.end_token]).split())) + + sampling_result = { + "sequence": beams[0], + "beams": beams, + "beam_losses": losses.squeeze().tolist(), + "loss": losses[0].item(), + "beam_lengths": counts.long().squeeze().tolist(), + "length": counts[0].long().item() + } + + return sampling_result + + +class BeamSampler(TopKSampler): + def __init__(self, opt, data_loader, batch_mode=True, scorer=None): + super(BeamSampler, self).__init__(opt, data_loader, batch_mode) + + self.kill_mask = torch.ones(opt.eval.bs, opt.eval.bs).to(cfg.device) * 9000 + self.kill_mask[:, 0] = 0 + + def make_batch(self, X): + X = np.array(X) + assert X.ndim in [1, 2] + if X.ndim == 1: + X = 
np.expand_dims(X, axis=0) + pos_enc = np.arange(n_vocab + n_special, n_vocab + n_special + X.shape[-1]) + pos_enc = np.expand_dims(pos_enc, axis=0) + batch = np.stack([X, pos_enc], axis=-1) + batch = torch.tensor(batch, dtype=torch.long).to(device) + return batch + + def append_batch(self, X, beam_toks, mask): + next_pos = X[:, -1:, 1] + 1 + next_x = torch.cat((beam_toks.unsqueeze(1), next_pos), -1).unsqueeze(1) + next_mask = torch.cat([mask, torch.ones(X.size(0), 1, device=mask.device)], 1) + return torch.cat((X, next_x), 1), next_mask + + def generate_sequence(self, batch, model, data_loader, start_idx, end_len): + # start_idx = context_size_event + 1 + # start_idx = max_e1 + max_r + # end_idx = context_size_effect - 1 + # end_idx = max_e2 + XMB = batch["sequences"][:, :start_idx] + MMB = batch["attention_mask"][:, :start_idx] + + XMB = model_utils.prepare_position_embeddings( + self.opt, data_loader.vocab_encoder, XMB.unsqueeze(-1)) + + tokens = [] + beam_losses = [] + # Beam Search + beam_lls, beam_toks, beam_seqs = None, None, None + _, lp = model(XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + dist = lm_probs[:, -1, :].squeeze() + beam_lls, beam_toks = dist.topk(self.opt.eval.bs) + beam_losses.append(beam_lls) + + ended = (beam_toks == self.end_token).float() + counts = (2 - ended) + beam_toks = beam_toks.unsqueeze(1) + beam_seqs = beam_toks.clone() + XMB = XMB.repeat(self.opt.eval.bs, 1, 1) + MMB = MMB.repeat(self.opt.eval.bs, 1) + next_pos = XMB[:, -1:, 1] + 1 + next_x = torch.cat((beam_toks, next_pos), -1).unsqueeze(1) + XMB = torch.cat((XMB, next_x), 1) + MMB = torch.cat([MMB, torch.ones(XMB.size(0), 1, device=MMB.device)], 1) + + for _ in range(end_len): + + # Compute distribution for current beam + _, lp = model( + XMB.unsqueeze(1), sequence_mask=MMB) + lm_probs = F.log_softmax(lp, dim=-1) + dist = lm_probs[:, -1, :].squeeze() + + # get hypothesis tokens for distribution + hyp_beam_lls, hyp_beam_toks = dist.topk(self.opt.eval.bs) + + # Compute masks and expand beam + expanded_ended = ended.unsqueeze(1).repeat(1, self.opt.eval.bs) + hypothesis_mask = expanded_ended * self.kill_mask + (1 - expanded_ended) + + paper_results = False + + if paper_results: + # Results from paper with slightly buggy beam search + current_beam_lls = beam_lls.unsqueeze(1).repeat( + 1, self.opt.eval.bs).view(self.opt.eval.bs**2) + else: + # Current beam search implementation + current_beam_lls = beam_losses[-1].unsqueeze(1).repeat( + 1, self.opt.eval.bs).view(self.opt.eval.bs**2) + + # Compute losses of hypotheses, masking those that have ended + hyp_beam_lls = (hyp_beam_lls.view(self.opt.eval.bs**2) * + hypothesis_mask.view(-1)) + current_beam_lls + + # Get normalizer for sequences + temp_counts = counts.unsqueeze(1).repeat(1, self.opt.eval.bs).view( + self.opt.eval.bs ** 2) + + # Select best beams with lowest aggregate loss + beam_lls, top_beam_idxs = (hyp_beam_lls / temp_counts).topk(self.opt.eval.bs) + + # Update placements in beam based on selecetion + beam_losses = [i.index_select(0, top_beam_idxs // self.opt.eval.bs) + for i in beam_losses] + ended = ended.index_select(0, top_beam_idxs // self.opt.eval.bs) + counts = temp_counts.index_select(0, top_beam_idxs) + + # Save beam losses + beam_losses.append(beam_lls * counts) + + # Update beam tokens + ended_mask = (1 - ended).long() + end_replacement = (self.end_token * ended).long() + next_toks = hyp_beam_toks.view(-1)[top_beam_idxs] + beam_toks = next_toks * ended_mask + end_replacement + + # Update ended and counts + 
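+            # Once a beam emits the end token, `ended` flags it, its `counts`
+            # entry stops growing, and the kill mask above leaves only the
+            # score-preserving first hypothesis viable, so finished beams are
+            # carried forward unchanged.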
ended = ended + (beam_toks == self.end_token).float() * (1 - ended) + counts = counts + (1 - ended) + + # Update beam sequences + beam_seqs = beam_seqs.t().repeat(self.opt.eval.bs, 1).t().contiguous().view( + self.opt.eval.bs**2, -1)[top_beam_idxs] + beam_seqs = torch.cat((beam_seqs, beam_toks.unsqueeze(1)), dim=1) + + # I have no idea what's going on but Ari's on point with it + XMB = XMB.transpose(0, 1).transpose(1, 2).repeat( + self.opt.eval.bs, 1, 1).transpose(2, 1).transpose( + 1, 0).contiguous().view( + self.opt.eval.bs**2, XMB.size(1), XMB.size(2))[top_beam_idxs] + + XMB, MMB = self.append_batch(XMB, beam_toks, MMB) + + if (beam_toks == self.end_token).sum().item() == self.opt.eval.bs: + break + + beams = [] + + for beam in beam_seqs: + beams.append(" ".join("".join( + [data_loader.vocab_decoder[tok.item()].replace( + '', ' ').replace('\n', '') + for tok in beam if tok != self.end_token]).split())) + + sampling_result = { + "sequence": beams[0], + "beams": beams, + "beam_losses": beam_lls.tolist(), + "loss": beam_lls[0].item(), + "beam_lengths": counts.tolist(), + "length": counts[0].item() + } + + return sampling_result diff --git a/Model/COSMIC/feature_extraction/src/evaluate/utils.py b/Model/COSMIC/feature_extraction/src/evaluate/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca71e091988215268645a13dcb38448cd05cff1 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/evaluate/utils.py @@ -0,0 +1,39 @@ + +def update_classification_losses(losses, nums, name, bs, loss): + if not isinstance(loss, float): + print(type(loss)) + raise + + nums[name] += bs + + losses[name] += loss * bs + + +def update_generation_losses(losses, nums, micro, macro, bs, length, loss): + # Update Losses + nums[macro] += bs + + if isinstance(length, int): + update_indiv_generation_losses( + losses, nums, micro, macro, bs, length, loss) + else: + update_tensor_generation_losses( + losses, nums, micro, macro, bs, length, loss) + + +def update_indiv_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += bs * length + + batch_loss = loss * bs + + losses[micro] += batch_loss + losses[macro] += batch_loss / length + + +def update_tensor_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += length.sum().item() + + losses[micro] += loss.sum().item() + losses[macro] += (loss / length.float()).sum().item() diff --git a/Model/COSMIC/feature_extraction/src/interactive/functions.py b/Model/COSMIC/feature_extraction/src/interactive/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..70d97d3262894d8990bc633d46e553f1a3506ef3 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/interactive/functions.py @@ -0,0 +1,328 @@ +import torch + +from comet.src.data.utils import TextEncoder +import comet.src.data.config as cfg +import comet.src.data.data as data +import comet.src.models.models as models +from comet.src.evaluate.sampler import BeamSampler, GreedySampler, TopKSampler + +import comet.utils.utils as utils + + +def load_model_file(model_file): + model_stuff = data.load_checkpoint(model_file) + opt = model_stuff["opt"] + state_dict = model_stuff["state_dict"] + + return opt, state_dict + +def load_data(dataset, opt): + if dataset == "atomic": + data_loader = load_atomic_data(opt) + elif dataset == "conceptnet": + data_loader = load_conceptnet_data(opt) + + # Initialize TextEncoder + encoder_path = "comet/model/encoder_bpe_40000.json" + bpe_path = "comet/model/vocab_40000.bpe" + text_encoder = 
TextEncoder(encoder_path, bpe_path) + text_encoder.encoder = data_loader.vocab_encoder + text_encoder.decoder = data_loader.vocab_decoder + + return data_loader, text_encoder + + +def load_atomic_data(opt): + # Hacky workaround, you may have to change this + # if your models use different pad lengths for e1, e2, r + if opt.data.get("maxe1", None) is None: + opt.data.maxe1 = 17 + opt.data.maxe2 = 35 + opt.data.maxr = 1 + # path = "data/atomic/processed/generation/{}.pickle".format( + # utils.make_name_string(opt.data)) + path = "comet/data/atomic/processed/generation/categories_oEffect#oReact#oWant#xAttr#xEffect#xIntent#xNeed#xReact#xWant-maxe1_17-maxe2_35-maxr_1.pickle" + data_loader = data.make_data_loader(opt, opt.data.categories) + loaded = data_loader.load_data(path) + + return data_loader + + +def load_conceptnet_data(opt): + # Hacky workaround, you may have to change this + # if your models use different pad lengths for r + if opt.data.get("maxr", None) is None: + if opt.data.rel == "language": + opt.data.maxr = 5 + else: + opt.data.maxr = 1 + path = "comet/data/conceptnet/processed/generation/{}.pickle".format( + utils.make_name_string(opt.data)) + data_loader = data.make_data_loader(opt) + loaded = data_loader.load_data(path) + return data_loader + + +def make_model(opt, n_vocab, n_ctx, state_dict): + model = models.make_model( + opt, n_vocab, n_ctx, None, load=False, + return_acts=True, return_probs=False) + + models.load_state_dict(model, state_dict) + + model.eval() + return model + + +def set_sampler(opt, sampling_algorithm, data_loader): + if "beam" in sampling_algorithm: + opt.eval.bs = int(sampling_algorithm.split("-")[1]) + sampler = BeamSampler(opt, data_loader) + elif "topk" in sampling_algorithm: + # print("Still bugs in the topk sampler. 
Use beam or greedy instead") + # raise NotImplementedError + opt.eval.k = int(sampling_algorithm.split("-")[1]) + sampler = TopKSampler(opt, data_loader) + else: + sampler = GreedySampler(opt, data_loader) + + return sampler + + +def get_atomic_sequence(input_event, model, sampler, data_loader, text_encoder, category): + if isinstance(category, list): + outputs = {} + for cat in category: + new_outputs = get_atomic_sequence( + input_event, model, sampler, data_loader, text_encoder, cat) + outputs.update(new_outputs) + return outputs + elif category == "all": + outputs = {} + + for category in data_loader.categories: + new_outputs = get_atomic_sequence( + input_event, model, sampler, data_loader, text_encoder, category) + outputs.update(new_outputs) + return outputs + else: + + sequence_all = {} + + sequence_all["event"] = input_event + sequence_all["effect_type"] = category + + with torch.no_grad(): + + batch = set_atomic_inputs( + input_event, category, data_loader, text_encoder) + + sampling_result = sampler.generate_sequence( + batch, model, data_loader, data_loader.max_event + + data.atomic_data.num_delimiter_tokens["category"], + data_loader.max_effect - + data.atomic_data.num_delimiter_tokens["category"]) + + sequence_all['beams'] = sampling_result["beams"] + + # print_atomic_sequence(sequence_all) + + return {category: sequence_all} + + +def print_atomic_sequence(sequence_object): + input_event = sequence_object["event"] + category = sequence_object["effect_type"] + + print("Input Event: {}".format(input_event)) + print("Target Effect: {}".format(category)) + print("") + print("Candidate Sequences:") + for beam in sequence_object["beams"]: + print(beam) + print("") + print("====================================================") + print("") + + +def set_atomic_inputs(input_event, category, data_loader, text_encoder): + XMB = torch.zeros(1, data_loader.max_event + 1).long().to(cfg.device) + prefix, suffix = data.atomic_data.do_example(text_encoder, input_event, None, True, None) + + if len(prefix) > data_loader.max_event + 1: + prefix = prefix[:data_loader.max_event + 1] + + XMB[:, :len(prefix)] = torch.LongTensor(prefix) + XMB[:, -1] = torch.LongTensor([text_encoder.encoder["<{}>".format(category)]]) + + batch = {} + batch["sequences"] = XMB + batch["attention_mask"] = data.atomic_data.make_attention_mask(XMB) + + return batch + + +def get_conceptnet_sequence(e1, model, sampler, data_loader, text_encoder, relation, force=False): + if isinstance(relation, list): + outputs = {} + + for rel in relation: + new_outputs = get_conceptnet_sequence( + e1, model, sampler, data_loader, text_encoder, rel) + outputs.update(new_outputs) + return outputs + elif relation == "all": + outputs = {} + + for relation in data.conceptnet_data.conceptnet_relations: + new_outputs = get_conceptnet_sequence( + e1, model, sampler, data_loader, text_encoder, relation) + outputs.update(new_outputs) + return outputs + else: + + sequence_all = {} + + sequence_all["e1"] = e1 + sequence_all["relation"] = relation + + with torch.no_grad(): + if data_loader.max_r != 1: + relation_sequence = data.conceptnet_data.split_into_words[relation] + else: + relation_sequence = "<{}>".format(relation) + + batch, abort = set_conceptnet_inputs( + e1, relation_sequence, text_encoder, + data_loader.max_e1, data_loader.max_r, force) + + if abort: + return {relation: sequence_all} + + sampling_result = sampler.generate_sequence( + batch, model, data_loader, + data_loader.max_e1 + data_loader.max_r, + data_loader.max_e2) + + 
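+            # generate_sequence returns a dict with the top decoded string
+            # ("sequence"), all decoded "beams", and per-beam losses and
+            # lengths; only the beam strings are kept below.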
+        sequence_all['beams'] = sampling_result["beams"]
+
+        print_conceptnet_sequence(sequence_all)
+
+        return {relation: sequence_all}
+
+
+def set_conceptnet_inputs(input_event, relation, text_encoder, max_e1, max_r, force):
+    abort = False
+
+    e1_tokens, rel_tokens, _ = data.conceptnet_data.do_example(text_encoder, input_event, relation, None)
+
+    if len(e1_tokens) > max_e1:
+        if force:
+            XMB = torch.zeros(1, len(e1_tokens) + max_r).long().to(cfg.device)
+        else:
+            XMB = torch.zeros(1, max_e1 + max_r).long().to(cfg.device)
+            return {}, True
+    else:
+        XMB = torch.zeros(1, max_e1 + max_r).long().to(cfg.device)
+
+    XMB[:, :len(e1_tokens)] = torch.LongTensor(e1_tokens)
+    XMB[:, max_e1:max_e1 + len(rel_tokens)] = torch.LongTensor(rel_tokens)
+
+    batch = {}
+    batch["sequences"] = XMB
+    batch["attention_mask"] = data.conceptnet_data.make_attention_mask(XMB)
+
+    return batch, abort
+
+
+def print_conceptnet_sequence(sequence_object):
+    e1 = sequence_object["e1"]
+    relation = sequence_object["relation"]
+
+    print("Input Entity: {}".format(e1))
+    print("Target Relation: {}".format(relation))
+    print("")
+    print("Candidate Sequences:")
+    for beam in sequence_object["beams"]:
+        print(beam)
+    print("")
+    print("====================================================")
+    print("")
+
+
+def print_help(data):
+    print("")
+    if data == "atomic":
+        print("Provide a seed event such as \"PersonX goes to the mall\"")
+        print("Don't include names, instead replacing them with PersonX, PersonY, etc.")
+        print("The event should always have PersonX included")
+    if data == "conceptnet":
+        print("Provide a seed entity such as \"go to the mall\"")
+        print("Because the model was trained on lemmatized entities,")
+        print("it works best if the input entities are also lemmatized")
+    print("")
+
+
+def print_relation_help(data):
+    print_category_help(data)
+
+
+def print_category_help(data):
+    print("")
+    if data == "atomic":
+        print("Enter a possible effect type from the following effect types:")
+        print("all - compute the output for all effect types {oEffect, oReact, oWant, xAttr, xEffect, xIntent, xNeed, xReact, xWant}")
+        print("oEffect - generate the effect of the event on participants other than PersonX")
+        print("oReact - generate the reactions of participants other than PersonX to the event")
+        print("oWant - generate what participants other than PersonX may want after the event")
+    elif data == "conceptnet":
+        print("Enter a possible relation from the following list:")
+        print("")
+        print('AtLocation')
+        print('CapableOf')
+        print('Causes')
+        print('CausesDesire')
+        print('CreatedBy')
+        print('DefinedAs')
+        print('DesireOf')
+        print('Desires')
+        print('HasA')
+        print('HasFirstSubevent')
+        print('HasLastSubevent')
+        print('HasPainCharacter')
+        print('HasPainIntensity')
+        print('HasPrerequisite')
+        print('HasProperty')
+        print('HasSubevent')
+        print('InheritsFrom')
+        print('InstanceOf')
+        print('IsA')
+        print('LocatedNear')
+        print('LocationOfAction')
+        print('MadeOf')
+        print('MotivatedByGoal')
+        print('NotCapableOf')
+        print('NotDesires')
+        print('NotHasA')
+        print('NotHasProperty')
+        print('NotIsA')
+        print('NotMadeOf')
+        print('PartOf')
+        print('ReceivesAction')
+        print('RelatedTo')
+        print('SymbolOf')
+        print('UsedFor')
+        print("")
+        print("NOTE: Capitalization is important")
+    else:
+        # A bare `raise` with no active exception is a RuntimeError;
+        # name the failure explicitly.
+        raise ValueError("Unknown dataset: {}".format(data))
+    print("")
+
+def print_sampling_help():
+    print("")
+    print("Provide a sampling algorithm to produce the sequence with, from the following:")
+    print("")
+    print("greedy")
+    print("beam-# where # is the beam size")
print("topk-# where # is k") + print("") diff --git a/Model/COSMIC/feature_extraction/src/main.py b/Model/COSMIC/feature_extraction/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..703cd3d05ca4e5e19db03a4bd1a309de0ac2c402 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/main.py @@ -0,0 +1,19 @@ +import sys +import os +import argparse + +sys.path.append(os.getcwd()) + +parser = argparse.ArgumentParser() +parser.add_argument("--experiment_type", type=str, default='atomic', + choices=["atomic", "conceptnet"]) +parser.add_argument("--experiment_num", type=str, default="0") + +args = parser.parse_args() + +if args.experiment_type == "atomic": + from main_atomic import main + main(args.experiment_num) +if args.experiment_type == "conceptnet": + from main_conceptnet import main + main(args.experiment_num) diff --git a/Model/COSMIC/feature_extraction/src/main_atomic.py b/Model/COSMIC/feature_extraction/src/main_atomic.py new file mode 100644 index 0000000000000000000000000000000000000000..cc6aacda3781a0b83ac9d1e51b3be4f0d3e14a35 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/main_atomic.py @@ -0,0 +1,125 @@ + +import random + +import torch + +import comet.src.train.atomic_train as train +import comet.src.models.models as models +import comet.src.data.data as data +import comet.utils.utils as utils +import comet.src.train.utils as train_utils +import comet.src.data.config as cfg + +from comet.src.data.utils import TextEncoder +from comet.src.train.opt import OpenAIAdam + + +def main(num): + # Generate configuration files depending on experiment being run + utils.generate_config_files("atomic", num) + + # Loads the correct configuration file + config_file = "config/atomic/config_{}.json".format(num) + + print(config_file) + + # Read config file to option + config = cfg.read_config(cfg.load_config(config_file)) + opt, meta = cfg.get_parameters(config) + + # Set the random seeds + torch.manual_seed(opt.train.static.seed) + random.seed(opt.train.static.seed) + if config.gpu_mode: + torch.cuda.manual_seed_all(opt.train.static.seed) + + # Where to find the data + splits = ["train", "dev", "test"] + + opt.train.dynamic.epoch = 0 + + print("Loading Data") + + categories = opt.data.categories + + path = "data/atomic/processed/{}/{}.pickle".format( + opt.exp, utils.make_name_string(opt.data)) + + data_loader = data.make_data_loader(opt, categories) + loaded = data_loader.load_data(path) + print(data_loader.sequences["train"]["total"].size(0)) + data_loader.opt = opt + data_loader.batch_size = opt.train.dynamic.bs + + print("Done.") + + # Initialize text_encoder + text_encoder = TextEncoder(config.encoder_path, config.bpe_path) + + special = [data.start_token, data.end_token] + special += ["<{}>".format(cat) for cat in categories] + special += [data.blank_token] + + text_encoder.encoder = data_loader.vocab_encoder + text_encoder.decoder = data_loader.vocab_decoder + + opt.data.maxe1 = data_loader.max_event + opt.data.maxe2 = data_loader.max_effect + opt.data.maxr = data.atomic_data.num_delimiter_tokens["category"] + + n_special = len(special) + n_ctx = opt.data.maxe1 + opt.data.maxe2 + n_vocab = len(text_encoder.encoder) + n_ctx + + print(data_loader.__dict__.keys()) + opt.net.vSize = n_vocab + + print("Building Model") + + model = models.make_model( + opt, n_vocab, n_ctx, n_special, + load=(opt.net.init=="pt")) + + print("Done.") + + print("Files will be logged at: {}".format( + utils.make_name(opt, prefix="results/losses/", + is_dir=True, eval_=True))) + + 
+    data_loader.reset_offsets("train")
+
+    # Get number of examples
+    data.set_max_sizes(data_loader)
+
+    if config.gpu_mode:
+        print("Pushing to GPU: {}".format(config.gpu_index))
+        cfg.device = config.gpu_index
+        cfg.do_gpu = True
+        torch.cuda.set_device(cfg.device)
+        if config.multigpu:
+            model = models.multi_gpu(
+                model, config.gpu_indices).cuda()
+        else:
+            model.cuda(cfg.device)
+        print("Done.")
+
+    print("Training")
+
+    optimizer = OpenAIAdam(model.parameters(),
+                           lr=opt.train.dynamic.lr,
+                           schedule=opt.train.static.lrsched,
+                           warmup=opt.train.static.lrwarm,
+                           t_total=meta.iterations,
+                           b1=opt.train.static.b1,
+                           b2=opt.train.static.b2,
+                           e=opt.train.static.e,
+                           l2=opt.train.static.l2,
+                           vector_l2=opt.train.static.vl2,
+                           max_grad_norm=opt.train.static.clip)
+
+    scorers = ["bleu", "rouge", "cider"]
+    trainer = train.make_trainer(
+        opt, meta, data_loader, model, optimizer)
+    trainer.set_evaluator(opt, model, data_loader)
+
+    trainer.run()
diff --git a/Model/COSMIC/feature_extraction/src/main_conceptnet.py b/Model/COSMIC/feature_extraction/src/main_conceptnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..baa252acaa90f719b9a631d0d51e0b4e3ecec0ed
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/main_conceptnet.py
@@ -0,0 +1,138 @@
+
+import random
+
+import torch
+
+import comet.src.train.conceptnet_train as train
+import comet.src.models.models as models
+import comet.src.data.data as data
+import comet.utils.utils as utils
+import comet.src.train.utils as train_utils
+import comet.src.data.config as cfg
+
+from comet.src.data.utils import TextEncoder
+from comet.src.train.opt import OpenAIAdam
+
+
+def main(num):
+    # Generate configuration files depending on experiment being run
+    utils.generate_config_files("conceptnet", num)
+
+    # Loads the correct configuration file
+    config_file = "config/conceptnet/config_{}.json".format(num)
+
+    print(config_file)
+
+    # Read config file to option
+    config = cfg.read_config(cfg.load_config(config_file))
+    opt, meta = cfg.get_parameters(config)
+
+    # config.gpu_mode = torch.cuda.is_available()
+
+    # Set the random seeds
+    torch.manual_seed(opt.train.static.seed)
+    random.seed(opt.train.static.seed)
+    if config.gpu_mode:
+        torch.cuda.manual_seed_all(opt.train.static.seed)
+
+    # Load the data
+    splits = ["train", "dev", "test"]
+
+    opt.train.dynamic.epoch = 0
+
+    print("Loading Data")
+
+    # Initialize path to pre-set data loader
+    path = "data/conceptnet/processed/{}/{}.pickle".format(
+        opt.exp, utils.make_name_string(opt.data))
+
+    # Make data loader
+    data_loader = data.make_data_loader(opt)
+    loaded = data_loader.load_data(path)
+    print(data_loader.sequences["train"]["total"].size(0))
+    data_loader.opt = opt
+    data_loader.batch_size = opt.train.dynamic.bs
+
+    print("Done.")
+
+    text_encoder = TextEncoder(config.encoder_path, config.bpe_path)
+
+    categories = data.conceptnet_data.conceptnet_relations
+
+    special = [data.start_token, data.end_token]
+    special += ["<{}>".format(cat) for cat in categories]
+
+    if loaded:
+        text_encoder.encoder = data_loader.vocab_encoder
+        text_encoder.decoder = data_loader.vocab_decoder
+    else:
+        for special_token in special:
+            # `len(encoder)` previously referenced an undefined name; index
+            # new special tokens off the encoder that is actually in scope.
+            idx = len(text_encoder.encoder)
+            text_encoder.decoder[idx] = special_token
+            text_encoder.encoder[special_token] = idx
+        data_loader.make_tensors(text_encoder, special)
+
+    # Set max size of different parts of relation
+    context_size_e1 = data_loader.max_e1
+    context_size_e2 = data_loader.max_e2
+    context_size_r = data_loader.max_r
+
+    opt.data.maxr = context_size_r
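+    # Positions are encoded as extra vocabulary ids appended after the word
+    # and special tokens (see models/utils.prepare_position_embeddings), so
+    # the vocabulary below grows by one entry per context position.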
n_special = len(special) + n_ctx = context_size_e1 + context_size_r + context_size_e2 + n_vocab = len(text_encoder.encoder) + n_ctx + + print(data_loader.__dict__.keys()) + opt.net.vSize = n_vocab + + # Build Model + print("Building Model") + + model = models.make_model( + opt, n_vocab, n_ctx, n_special, + load=(opt.net.init=="pt")) + + print("Done.") + + print("Files will be logged at: {}".format( + utils.make_name(opt, prefix="results/losses/", + is_dir=True, eval_=True))) + + data_loader.reset_offsets("train", keys=["total"]) + + data.set_max_sizes(data_loader) + + # Push to GPU + if config.gpu_mode: + print("Pushing to GPU: {}".format(config.gpu_index)) + cfg.device = config.gpu_index + cfg.do_gpu = True + torch.cuda.set_device(cfg.device) + if config.multigpu: + model = models.multi_gpu( + model, config.gpu_indices).cuda() + else: + model.cuda(cfg.device) + print("Done.") + + print("Training") + + optimizer = OpenAIAdam(model.parameters(), + lr=opt.train.dynamic.lr, + schedule=opt.train.static.lrsched, + warmup=opt.train.static.lrwarm, + t_total=meta.iterations, + b1=opt.train.static.b1, + b2=opt.train.static.b2, + e=opt.train.static.e, + l2=opt.train.static.l2, + vector_l2=opt.train.static.vl2, + max_grad_norm=opt.train.static.clip) + + trainer = train.make_trainer( + opt, meta, data_loader, model, optimizer) + print(data_loader.sequences["dev"]["total"].max()) + trainer.set_generator(opt, model, data_loader) + trainer.set_evaluator(opt, model, data_loader) + + trainer.run() diff --git a/Model/COSMIC/feature_extraction/src/models/gpt.py b/Model/COSMIC/feature_extraction/src/models/gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..22f5eda7f130f27c80aed1ff72bd1d62aacfbdee --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/models/gpt.py @@ -0,0 +1,311 @@ +import copy +import json +import math +import re + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.parameter import Parameter + + +''' +Much of this code is taken from HuggingFace's OpenAI LM Implementation here: + +https://github.com/huggingface/pytorch-openai-transformer-lm +''' + + +def gelu(x): + return (0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * + (x + 0.044715 * torch.pow(x, 3))))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT_FNS = { + 'relu': nn.ReLU, + 'swish': swish, + 'gelu': gelu +} + + +class LayerNorm(nn.Module): + "Construct a layernorm module in the OpenAI style \ + (epsilon inside the square root)." 
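+    # Computes g * (x - mean(x)) / sqrt(var(x) + e) + b over the last
+    # dimension; epsilon sits inside the square root, per the docstring.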
+ + def __init__(self, n_state, e=1e-5): + super(LayerNorm, self).__init__() + self.g = nn.Parameter(torch.ones(n_state)) + self.b = nn.Parameter(torch.zeros(n_state)) + self.e = e + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.e) + return self.g * x + self.b + + +class Conv1D(nn.Module): + def __init__(self, nf, rf, nx): + super(Conv1D, self).__init__() + self.rf = rf + self.nf = nf + if rf == 1: # faster 1x1 conv + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.w = Parameter(w) + self.b = Parameter(torch.zeros(nf)) + else: # was used to train LM + raise NotImplementedError + + def forward(self, x): + if self.rf == 1: + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w) + x = x.view(*size_out) + else: + raise NotImplementedError + return x + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, cfg, scale=False): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + + assert n_state % cfg.nH == 0 + self.register_buffer('b', torch.tril(torch.ones( + n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = cfg.nH + self.split_size = n_state + self.scale = scale + self.c_attn = Conv1D(n_state * 3, 1, nx) + self.c_proj = Conv1D(n_state, 1, nx) + self.attn_dropout = nn.Dropout(cfg.adpt) + self.resid_dropout = nn.Dropout(cfg.rdpt) + + # dimensions of w: (batch_size x num_heads x seq_length x seq_length) + def _attn(self, q, k, v, sequence_mask): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + + b_subset = self.b[:, :, :w.size(-2), :w.size(-1)] + + if sequence_mask is not None: + b_subset = b_subset * sequence_mask.view( + sequence_mask.size(0), 1, -1) + b_subset = b_subset.permute(1, 0, 2, 3) + + w = w * b_subset + -1e9 * (1 - b_subset) + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + return torch.matmul(w, v) + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) + else: + return x.permute(0, 2, 1, 3) + + def forward(self, x, sequence_mask): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + a = self._attn(query, key, value, sequence_mask) + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + return a + + +class MLP(nn.Module): + def __init__(self, n_state, cfg): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = cfg.hSize + self.c_fc = Conv1D(n_state, 1, nx) + self.c_proj = Conv1D(nx, 1, n_state) + self.act = ACT_FNS[cfg.afn] + self.dropout = nn.Dropout(cfg.rdpt) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, cfg, scale=False): + super(Block, self).__init__() + nx = cfg.hSize + self.attn = Attention(nx, n_ctx, cfg, scale) + self.ln_1 = LayerNorm(nx) + self.mlp = MLP(4 * nx, cfg) + self.ln_2 = LayerNorm(nx) + + def forward(self, x, sequence_mask): + a = self.attn(x, sequence_mask) + n = self.ln_1(x + a) + m = self.mlp(n) 
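+        # Post-LN residual ordering, as in the original GPT: LayerNorm is
+        # applied to each residual sum rather than to the sublayer input.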
+ h = self.ln_2(n + m) + return h + + +class TransformerModel(nn.Module): + """ Transformer model """ + + def __init__(self, cfg, vocab=40990, n_ctx=512): + super(TransformerModel, self).__init__() + self.vocab = vocab + self.embed = nn.Embedding(vocab, cfg.hSize) + self.drop = nn.Dropout(cfg.edpt) + block = Block(n_ctx, cfg, scale=True) + self.h = nn.ModuleList([copy.deepcopy(block) + for _ in range(cfg.nL)]) + + nn.init.normal_(self.embed.weight, std=0.02) + + def forward(self, x, sequence_mask): + x = x.view(-1, x.size(-2), x.size(-1)) + e = self.embed(x) + # Add the position information to the input embeddings + h = e.sum(dim=2) + for block in self.h: + h = block(h, sequence_mask) + return h + + +class LMModel(nn.Module): + """ Transformer with language model head only """ + def __init__(self, cfg, vocab=40990, n_ctx=512, + return_probs=False, return_acts=False): + super(LMModel, self).__init__() + self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx) + self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False) + self.return_probs = return_probs + self.return_acts = return_acts + if self.return_probs or self.return_acts: + pos_emb_mask = torch.zeros(1, 1, vocab) + pos_emb_mask[:, :, -n_ctx:] = -1e12 + self.register_buffer('pos_emb_mask', pos_emb_mask) + + def forward(self, x, sequence_mask=None): + h = self.transformer(x, sequence_mask) + lm_logits = self.lm_head(h) + if self.return_probs: + lm_logits = F.softmax(lm_logits + self.pos_emb_mask, dim=-1) + elif self.return_acts: + lm_logits = lm_logits + self.pos_emb_mask + return h, lm_logits + + +class LMHead(nn.Module): + """ Language Model Head for the transformer """ + + def __init__(self, model, cfg, trunc_and_reshape=True): + super(LMHead, self).__init__() + self.n_embd = cfg.hSize + embed_shape = model.embed.weight.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) + self.decoder.weight = model.embed.weight # Tied weights + self.trunc_and_reshape = trunc_and_reshape # XD + + def forward(self, h): + # Truncated Language modeling logits (we remove the last token) + h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) \ + if self.trunc_and_reshape else h # XD + lm_logits = self.decoder(h_trunc) + return lm_logits + + +def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, + n_embd=768, path='./model/', path_names='./'): + # Load weights from TF model + print("Loading weights...") + names = json.load(open(path_names + 'parameters_names.json')) + shapes = json.load(open(path + 'params_shapes.json')) + offsets = np.cumsum([np.prod(shape) for shape in shapes]) + init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)] + init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] + init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] + if n_ctx > 0: + init_params[0] = init_params[0][:n_ctx] + if n_special > 0: + init_params[0] = np.concatenate( + [init_params[1], + (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32), + init_params[0] + ], 0) + else: + init_params[0] = np.concatenate( + [init_params[1], + init_params[0] + ], 0) + del init_params[1] + if n_transfer == -1: + n_transfer = 0 + else: + n_transfer = 1 + n_transfer * 12 + init_params = [arr.squeeze() for arr in init_params] + + try: + assert model.embed.weight.shape == init_params[0].shape + except AssertionError as e: + e.args += (model.embed.weight.shape, init_params[0].shape) + raise + + model.embed.weight.data = 
torch.from_numpy(init_params[0])
+
+    for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
+        name = name[6:]  # skip "model/"
+        assert name[-2:] == ":0"
+        name = name[:-2]
+        name = name.split('/')
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
+                l = re.split(r'(\d+)', m_name)
+            else:
+                l = [m_name]
+            pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        try:
+            assert pointer.shape == ip.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, ip.shape)
+            raise
+        pointer.data = torch.from_numpy(ip)
+
+
+class dotdict(dict):
+    """dot.notation access to dictionary attributes"""
+    __getattr__ = dict.get
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+
+
+DEFAULT_CONFIG = dotdict({
+    'n_embd': 768,
+    'n_head': 12,
+    'n_layer': 12,
+    'embd_pdrop': 0.1,
+    'attn_pdrop': 0.1,
+    'resid_pdrop': 0.1,
+    'afn': 'gelu',
+    'clf_pdrop': 0.1})
diff --git a/Model/COSMIC/feature_extraction/src/models/models.py b/Model/COSMIC/feature_extraction/src/models/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..89600130ef48664e1705b861930cfb4574c54a25
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/models/models.py
@@ -0,0 +1,32 @@
+from comet.src.models.gpt import (LMModel, DEFAULT_CONFIG, load_openai_pretrained_model)
+import torch.nn as nn
+
+
+def make_model(opt, n_vocab, n_ctx, n_special, load=True,
+               return_acts=True, return_probs=False,
+               clf_token="<CLASS>", answer_size=None):
+    # The clf_token default had been stripped to "" in this diff; "<CLASS>"
+    # matches the upstream COMET source.
+    print(n_ctx)
+    if opt.exp == "generation":
+        model = LMModel(
+            opt.net, n_vocab, n_ctx, return_acts=return_acts,
+            return_probs=return_probs)
+    elif opt.exp == "classification":
+        # ClfModel is not defined in this snapshot's gpt.py; only the
+        # "generation" branch is exercised by this pipeline.
+        model = ClfModel(
+            opt.net, n_vocab, n_ctx, clf_token, answer_size)
+    if load:
+        print("LOADING PRETRAINED TRANSFORMER")
+        load_openai_pretrained_model(
+            model.transformer, n_ctx=n_ctx, n_special=n_special)
+    return model
+
+
+def multi_gpu(model, devices):
+    return nn.DataParallel(model, device_ids=devices)
+
+
+def load_state_dict(model, state_dict):
+    try:
+        model.load_state_dict(state_dict)
+    except RuntimeError:
+        new_state_dict = {i[len("module."):]: j for i, j in state_dict.items()}
+        model.load_state_dict(new_state_dict)
diff --git a/Model/COSMIC/feature_extraction/src/models/utils.py b/Model/COSMIC/feature_extraction/src/models/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d83ffe40d3fb814dcdfc379c645cf153087d7e96
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/models/utils.py
@@ -0,0 +1,12 @@
+import torch
+
+
+def prepare_position_embeddings(opt, encoder_vocab, sequences):
+    # Pairs every token id with a position id (offset past the vocabulary)
+    # so the embedding layer can sum token and position embeddings.
+    vocab_size = len(encoder_vocab)
+    num_positions = sequences.size(-2)
+    position_embeddings = torch.LongTensor(
+        range(vocab_size, vocab_size + num_positions)).to(sequences.device)
+    sequences = sequences.repeat(1, 1, 2)
+    sequences[:, :, 1] = position_embeddings
+    return sequences
diff --git a/Model/COSMIC/feature_extraction/src/train/atomic_train.py b/Model/COSMIC/feature_extraction/src/train/atomic_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1351774d42d2a1d55a06e4412f7c434c252c982
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/train/atomic_train.py
@@ -0,0 +1,76 @@
+import random
+
+import comet.src.train.train as base_train
+import comet.src.train.batch as batch
+import comet.src.evaluate.atomic_evaluate as evaluate
+# import comet.src.evaluate.atomic_generate as gen
+
+
+def make_trainer(opt, *args):
+    return
AtomicGenerationIteratorTrainer(opt, *args) + + +class AtomicGenerationIteratorTrainer(base_train.IteratorTrainer): + def __init__(self, opt, *args): + super(AtomicGenerationIteratorTrainer, self).__init__(opt, *args) + + self.initialize_losses(opt.data.get("categories", [])) + + def set_evaluator(self, opt, model, data_loader): + self.evaluator = evaluate.make_evaluator( + opt, model, data_loader) + + # def set_generator(self, opt, model, data_loader, scores, reward=None): + # self.generator = gen.make_generator( + # opt, model, data_loader, scores, reward) + + def set_sampler(self, opt): + if opt.train.static.samp not in self.samplers: + self.samplers[opt.train.static.samp] = sampling.make_sampler( + opt.train.static.samp, opt, self.data_loader, batch_mode=True) + self.batch_variables["sampler"] = self.samplers + + def batch(self, opt, *args): + outputs = batch.batch_atomic_generate(opt, *args) + + token_loss = outputs["loss"] + nums = outputs["nums"] + reset = outputs["reset"] + + return token_loss, nums, reset + + def initialize_losses(self, categories): + self.losses["train"] = { + "total_micro": [0], + "total_macro": [0] + } + + nums = {"total_micro": 0, "total_macro": 0} + + for category in categories: + micro_name = "{}_micro".format(category) + macro_name = "{}_macro".format(category) + + self.losses["train"][micro_name] = [0] + self.losses["train"][macro_name] = [0] + + nums[micro_name] = 0 + nums[macro_name] = 0 + + return nums + + def update_top_score(self, opt): + print(self.top_score) + if self.top_score is None: + self.top_score = (self.opt.train.dynamic.epoch, + self.get_tracked_score()) + elif self.get_tracked_score() < self.top_score[-1]: + self.top_score = (self.opt.train.dynamic.epoch, + self.get_tracked_score()) + print(self.top_score) + + def get_tracked_score(self): + return self.losses["dev"]["total_micro"][self.opt.train.dynamic.epoch] + + def counter(self, nums): + return nums["total_macro"] diff --git a/Model/COSMIC/feature_extraction/src/train/batch.py b/Model/COSMIC/feature_extraction/src/train/batch.py new file mode 100644 index 0000000000000000000000000000000000000000..4f90de737edc823148148c791a1dff383aa5428f --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/train/batch.py @@ -0,0 +1,135 @@ + +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comet.src.data.config as cfg +import comet.src.train.utils as train_utils +import comet.src.models.utils as model_utils +import comet.src.evaluate.utils as eval_utils +import comet.utils.utils as utils +from IPython import embed + + +############################################################################## +# BATCH +############################################################################## + + +def batch_atomic_generate(opt, nums, losses, batch_variables, eval_mode=False): + data_loader = batch_variables["data"] + model = batch_variables["model"] + split = batch_variables["split"] + + batch, reset = data_loader.sample_batch(split, bs=opt.train.dynamic.bs) + + input_ = model_utils.prepare_position_embeddings( + opt, data_loader.vocab_encoder, batch["sequences"].unsqueeze(-1)) + attention_mask = batch["attention_mask"] + loss_mask = batch["loss_mask"] + + targets = input_.squeeze(0)[:, 1:, 0].contiguous().view(-1) + + loss, dist = mle_steps( + opt.net.model, model, input_[:, :-1, :], targets, + attention_mask[:, :-1], loss_reduction="none") + + # Set loss name + micro_name = "total_micro" + macro_name = "total_macro" + + length = loss_mask.sum(1) + bs = 
input_.size(0) + + final_loss = (loss * loss_mask).sum(1) + + update_generation_losses(losses, nums, micro_name, macro_name, bs, + length, (loss * loss_mask).sum(1), split) + + final_loss = final_loss / length + + outputs = {"loss": final_loss.sum(), "nums": nums, "reset": reset} + + return outputs + + +def batch_conceptnet_generate(opt, nums, losses, batch_variables, + eval_mode=False, tracking_mode=False): + data_loader = batch_variables["data"] + model = batch_variables["model"] + split = batch_variables["split"] + category = batch_variables["category"] + + batch, reset = data_loader.sample_batch( + split, bs=opt.train.dynamic.bs, cat=category) + + input_ = model_utils.prepare_position_embeddings( + opt, data_loader.vocab_encoder, batch["sequences"].unsqueeze(-1)) + attention_mask = batch["attention_mask"] + loss_mask = batch["loss_mask"] + + targets = input_.squeeze(0)[:, 1:, 0].contiguous().view(-1) + + loss, dist = mle_steps( + opt.net.model, model, input_[:, :-1, :], targets, + attention_mask[:, :-1], loss_reduction="none") + + # Set loss name + if not eval_mode or batch_variables["category"] == "positive": + micro_name = "total_micro" + macro_name = "total_macro" + else: + micro_name = "negative_micro" + macro_name = "negative_macro" + + length = loss_mask.sum(1) + bs = input_.size(0) + + final_loss = (loss * loss_mask).sum(1) + + update_generation_losses(losses, nums, micro_name, macro_name, bs, + length, (loss * loss_mask).sum(1), split) + + final_loss = final_loss / length + + outputs = {"loss": final_loss.sum(), "nums": nums, "reset": reset} + + if tracking_mode: + outputs["tracking"] = final_loss.squeeze().tolist() + + return outputs + + +def mle_steps(key, model, input_, targets, attention_mask, + loss_reduction="mean", i=None): + word_acts = decode(model, input_.unsqueeze(1), + attention_mask, i) + + word_dist = train_utils.modify_output_for_loss_fn( + "nll", word_acts, dim=-1) + + # Compute losses + loss = F.nll_loss( + word_dist.view(-1, word_dist.size(-1)), + targets, reduction=loss_reduction) + + if loss_reduction != "mean": + return loss.view(word_dist.size(0), -1), word_dist + else: + return loss, word_dist + + +def decode(model, input_, attention_mask, i=None): + return model(input_, sequence_mask=attention_mask) + + +def update_generation_losses(losses, nums, micro, macro, bs, + length, loss, split): + if split == "train": + train_utils.update_generation_losses( + losses, nums, micro, macro, bs, length, loss) + else: + eval_utils.update_generation_losses( + losses, nums, micro, macro, bs, length, loss) diff --git a/Model/COSMIC/feature_extraction/src/train/conceptnet_train.py b/Model/COSMIC/feature_extraction/src/train/conceptnet_train.py new file mode 100644 index 0000000000000000000000000000000000000000..422dff8059572142e9b797b40c32b6f9e2b06414 --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/train/conceptnet_train.py @@ -0,0 +1,67 @@ +import random +import torch + +import comet.src.data.config as cfg + +import comet.src.train.atomic_train as base_train +import comet.src.train.batch as batch_utils +import comet.src.evaluate.conceptnet_evaluate as evaluate +import comet.src.evaluate.conceptnet_generate as gen + + +def make_trainer(opt, *args): + return ConceptNetGenerationIteratorTrainer(opt, *args) + + +class ConceptNetGenerationIteratorTrainer( + base_train.AtomicGenerationIteratorTrainer): + def set_evaluator(self, opt, model, data_loader): + self.evaluator = evaluate.make_evaluator( + opt, model, data_loader) + + def set_generator(self, opt, model, 
data_loader):
+        self.generator = gen.make_generator(
+            opt, model, data_loader)
+
+    def batch(self, opt, *args):
+        outputs = batch_utils.batch_atomic_generate(opt, *args)
+
+        token_loss = outputs["loss"]
+        nums = outputs["nums"]
+        reset = outputs["reset"]
+
+        return token_loss, nums, reset
+
+    def update_top_score(self, opt):
+        print(self.top_score)
+
+        tracked_scores = self.get_tracked_score()
+
+        if self.top_score is None:
+            self.top_score = {"epoch": {}, "score": {}}
+            self.top_score["epoch"]["total_micro"] = self.opt.train.dynamic.epoch
+            self.top_score["score"]["total_micro"] = tracked_scores["total_micro"]
+        else:
+            if tracked_scores["total_micro"] < self.top_score["score"]["total_micro"]:
+                self.top_score["epoch"]["total_micro"] = self.opt.train.dynamic.epoch
+                self.top_score["score"]["total_micro"] = tracked_scores["total_micro"]
+
+        print(self.top_score)
+
+    def get_tracked_score(self):
+        return {
+            "total_micro": self.losses["dev"]["total_micro"][self.opt.train.dynamic.epoch]
+        }
+
+    def decide_to_save(self):
+        to_save = cfg.save and not cfg.toy
+
+        curr_epoch = self.opt.train.dynamic.epoch
+
+        to_save = to_save or cfg.test_save
+        print(cfg.save_strategy)
+        if cfg.save_strategy == "best":
+            if self.top_score["epoch"]["total_micro"] != curr_epoch:
+                to_save = False
+        return to_save
diff --git a/Model/COSMIC/feature_extraction/src/train/opt.py b/Model/COSMIC/feature_extraction/src/train/opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef8b9666a41a52f775dfa38797648aa0553af2e0
--- /dev/null
+++ b/Model/COSMIC/feature_extraction/src/train/opt.py
@@ -0,0 +1,122 @@
+'''TAKEN from OpenAI LM Code by HuggingFace'''
+
+import math
+import torch
+from torch.optim import Optimizer
+from torch.nn.utils import clip_grad_norm_
+
+
+def warmup_cosine(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    # x is a plain float here, so math.cos (not torch.cos) is required
+    return s*(x/warmup) + (1-s)*(0.5 * (1 + math.cos(math.pi * x)))
+
+
+def warmup_constant(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    return s*(x/warmup) + (1-s)*1
+
+
+def warmup_linear(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    return (s*(x/warmup) + (1-s))*(1-x)
+
+
+SCHEDULES = {
+    'warmup_cosine': warmup_cosine,
+    'warmup_constant': warmup_constant,
+    'warmup_linear': warmup_linear,
+}
+
+
+class OpenAIAdam(Optimizer):
+    """Implements the OpenAI version of the Adam algorithm with the weight decay fix.
+    """
+    def __init__(self, params, lr, schedule, warmup, t_total,
+                 b1=0.9, b2=0.999, e=1e-8, l2=0,
+                 vector_l2=False, max_grad_norm=-1, **kwargs):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if schedule not in SCHEDULES:
+            raise ValueError("Invalid schedule parameter: {}".format(schedule))
+        if not 0 <= warmup:
+            raise ValueError("Invalid warmup: {}".format(warmup))
+        if not 0.0 <= b1 < 1.0:
+            raise ValueError("Invalid b1 parameter: {}".format(b1))
+        if not 0.0 <= b2 < 1.0:
+            raise ValueError("Invalid b2 parameter: {}".format(b2))
+        if not 0.0 <= e:
+            raise ValueError("Invalid epsilon value: {}".format(e))
+        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                        max_grad_norm=max_grad_norm)
+        super(OpenAIAdam, self).__init__(params, defaults)
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + # print(group['t_total']) + # print(group['warmup']) + # if self.state[group['params'][0]]: + # print(self.state[group['params'][0]]['step'] / group['t_total']) + # print() + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'Adam does not support sparse gradients, \ + please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['b1'], group['b2'] + + state['step'] += 1 + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['e']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + schedule_fct = SCHEDULES[group['schedule']] + lr_scheduled = (group['lr'] * schedule_fct(state['step'] / + group['t_total'], group['warmup'])) + step_size = (lr_scheduled * math.sqrt(bias_correction2) / + bias_correction1) + + p.data.addcdiv_(-step_size, exp_avg, denom) + + # Add weight decay at the end (fixed version) + if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0: + p.data.add_(-lr_scheduled * group['l2'], p.data) + + return loss diff --git a/Model/COSMIC/feature_extraction/src/train/train.py b/Model/COSMIC/feature_extraction/src/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..7b050bd0d39ab0d72adfffbda72bb73f2eae247a --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/train/train.py @@ -0,0 +1,233 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comet.src.data.config as cfg +import comet.src.data.data as data +import comet.src.train.utils as train_utils +import comet.src.train.batch as batch + +import comet.src.evaluate.evaluate as evaluate +import comet.src.evaluate.generate as gen +import comet.src.evaluate.sampler as sampling + +import comet.utils.utils as utils + +from tensorboardX import SummaryWriter + + +class Trainer(object): + def __init__(self, opt, meta, data_loader, model, optimizer): + self.optimizer = optimizer + + self.model = model + + if opt.trainer == "epoch": + self.epochs = meta.epochs + self.data_loader = data_loader + self.opt = opt + + self.losses = {"dev": {}, "test": {}, "train": {}} + + self.top_score = None + + self.lrs = {} + + self.batch_variables = { + "data": self.data_loader, + "model": self.model, + "split": "train" + } + + self.do_gen = cfg.do_gen + self.samplers = {} + + def decide_to_save(self): + to_save = cfg.save and not cfg.toy + + to_save = to_save or cfg.test_save + print(cfg.save_strategy) + if cfg.save_strategy == "best": + if self.top_score[0] != self.opt.train.dynamic.epoch: + print("DOING IT RIGHT") + to_save = False + return to_save + + def save_model(self, tracked_score): + lrs = {} + for i, param_group in enumerate(self.optimizer.param_groups): + lrs[i] = param_group['lr'] + self.lrs[self.opt.train.dynamic.epoch] = lrs + + to_save = 
self.decide_to_save() + + if to_save: + data.save_step( + self.model, self.data_loader.vocab_encoder, + self.optimizer, self.opt, + self.opt.train.dynamic.epoch, self.lrs) + + def log_losses(self, opt, losses): + if (not cfg.toy and cfg.save) or cfg.test_save: + data.save_eval_file(opt, losses["train"], "losses", split="train") + data.save_eval_file(opt, losses['dev'], "losses", split="dev") + data.save_eval_file(opt, losses['test'], "losses", split="test") + + def set_logger(self): + if cfg.toy: + self.logger = SummaryWriter(utils.make_name( + self.opt, prefix="garbage/logs/", eval_=True, do_epoch=False)) + else: + self.logger = SummaryWriter(utils.make_name( + self.opt, prefix="logs/", eval_=True, do_epoch=False)) + print("Logging Tensorboard Files at: {}".format(self.logger.logdir)) + + def stop_logger(self): + self.logger.close() + + def run(self): + self.set_logger() + self.count = 0 + for epoch in range(self.epochs): + self.model.train() + self.opt.train.dynamic.epoch += 1 + self.epoch() + + self.stop_logger() + + def epoch(self): + nums = self.reset_losses() + + # Initialize progress bar + bar = utils.initialize_progress_bar( + self.data_loader.sequences["train"]) + + reset = False + + while not reset: + loss, nums, reset = self.do_forward_pass(nums) + self.do_backward_pass(loss) + self.update_parameters() + + bar.update(self.opt.train.dynamic.bs) + self.count += 1 + + for loss_name in self.losses["train"]: + self.logger.add_scalar( + "train/{}".format(loss_name), + loss.item() / self.opt.train.dynamic.bs, + self.count) + + if cfg.toy and self.counter(nums) > 300: + break + + with torch.no_grad(): + self.run_evaluation_cycle() + + self.log_losses(self.opt, self.losses) + self.update_top_score(self.opt) + self.save_model(self.get_tracked_score()) + + self.data_loader.reset_offsets("train") + + def run_evaluation_cycle(self): + for split in ["dev", "test"]: + self.evaluator.validate( + self.opt.train.dynamic.epoch, split, + self.losses[split]) + + if self.do_gen: + gen.do_gen_run( + self.opt, self.generator, self.opt.train.dynamic.epoch, + split, self.losses[split]) + iter_num = self.opt.train.dynamic.epoch + + for loss_name in self.losses[split]: + self.logger.add_scalar( + "{}/{}".format(split, loss_name), + self.losses[split][loss_name][iter_num], + iter_num) + + def clip_gradients(self): + if self.opt.train.static.clip: + torch.nn.utils.clip_grad_norm_( + self.model.parameters(), self.opt.train.static.clip) + + def do_forward_pass(self, nums): + token_loss, nums, reset = self.batch( + self.opt, nums, self.losses["train"], + self.batch_variables) + return token_loss, nums, reset + + def do_backward_pass(self, loss): + loss.backward() + + def update_parameters(self): + if self.opt.model == "lstm": + self.clip_gradients() + self.optimizer.step() + self.optimizer.zero_grad() + + def reset_losses(self): + loss_names = set([i.rstrip("maicro").rstrip("_") for + i in self.losses["train"].keys()]) + return self.initialize_losses(list(loss_names)) + + +class IteratorTrainer(Trainer): + def __init__(self, opt, meta, data_loader, model, optimizer): + super(IteratorTrainer, self).__init__( + opt, meta, data_loader, model, optimizer) + + self.iters = meta.cycle + self.total_iters = meta.iterations + + def run(self): + self.set_logger() + + # Initialize progress bar + bar = utils.set_progress_bar(self.total_iters) + + for cycle_num in range(int(self.total_iters / self.iters)): + self.model.train() + + self.cycle(bar, cycle_num) + + with torch.no_grad(): + self.run_evaluation_cycle() + + 
self.log_losses(self.opt, self.losses) + self.update_top_score(self.opt) + self.save_model(self.get_tracked_score()) + + self.stop_logger() + + def cycle(self, bar, cycle_num): + nums = self.reset_losses() + print(self.losses["train"]) + + for i in range(1, self.iters + 1): + # self.model.zero_grad() + + loss, nums, reset = self.do_forward_pass(nums) + self.do_backward_pass(loss) + + self.update_parameters() + # print(loss) + # print(loss.item()) + self.opt.train.dynamic.epoch += 1 + + for loss_name in self.losses["train"]: + self.logger.add_scalar( + "train/{}".format(loss_name), + loss.item() / self.opt.train.dynamic.bs, + self.opt.train.dynamic.epoch) + + bar.update(1) + + if cfg.toy and i > 10: + break + + if reset: + self.data_loader.reset_offsets("train") + diff --git a/Model/COSMIC/feature_extraction/src/train/utils.py b/Model/COSMIC/feature_extraction/src/train/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1fc4780c4391f8dcd0d78a8b633a6ae51fec1a1f --- /dev/null +++ b/Model/COSMIC/feature_extraction/src/train/utils.py @@ -0,0 +1,58 @@ +import torch +import torch.optim +import torch.nn.functional as F + +import copy + + +def update_generation_losses(losses, nums, micro, macro, bs, length, loss): + # Update Losses + losses[micro] += \ + [copy.deepcopy(losses[micro][-1])] + losses[macro] += \ + [copy.deepcopy(losses[macro][-1])] + + losses[micro][-1] *= nums[micro] + losses[macro][-1] *= nums[macro] + + nums[macro] += bs + + if isinstance(length, int): + update_indiv_generation_losses( + losses, nums, micro, macro, bs, length, loss) + else: + update_tensor_generation_losses( + losses, nums, micro, macro, bs, length, loss) + + +def update_indiv_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += (bs * length) + + batch_loss = loss * bs + + losses[micro][-1] += batch_loss + losses[micro][-1] /= nums[micro] + losses[macro][-1] += batch_loss / length + losses[macro][-1] /= nums[macro] + + +def update_tensor_generation_losses(losses, nums, micro, + macro, bs, length, loss): + nums[micro] += length.sum().item() + + losses[micro][-1] += loss.sum().item() + losses[micro][-1] /= nums[micro] + losses[macro][-1] += (loss / length.float()).sum().item() + losses[macro][-1] /= nums[macro] + + +def modify_output_for_loss_fn(loss_fn, output, dim): + if loss_fn == "ce": + return output + if loss_fn == "mse": + return F.softmax(output, dim=dim) + if loss_fn == "nll": + return F.log_softmax(output, dim=dim) + if loss_fn in ["bce", "wbce", "wbce1"]: + return torch.sigmoid(output) diff --git a/Model/COSMIC/feature_extraction/utils/__init__.py b/Model/COSMIC/feature_extraction/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Model/COSMIC/feature_extraction/utils/utils.py b/Model/COSMIC/feature_extraction/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..459c80a583446f45f8a4caf91dadc3cc39e2ba8b --- /dev/null +++ b/Model/COSMIC/feature_extraction/utils/utils.py @@ -0,0 +1,210 @@ +import json +import copy + +import torch + +import numpy as np +import contextlib + +from distutils.dir_util import mkpath + +from tqdm import tqdm + + +def make_new_tensor_from_list(items, device_num, dtype=torch.float32): + if device_num is not None: + device = torch.device("cuda:{}".format(device_num)) + else: + device = torch.device("cpu") + return torch.tensor(items, dtype=dtype, device=device) + + +# is_dir look ast at whether the name we 
make +# should be a directory or a filename +def make_name(opt, prefix="", eval_=False, is_dir=True, set_epoch=None, + do_epoch=True): + string = prefix + string += "{}-{}".format(opt.dataset, opt.exp) + string += "/" + string += "{}-{}-{}".format(opt.trainer, opt.cycle, opt.iters) + string += "/" + string += opt.model + if opt.mle: + string += "-{}".format(opt.mle) + string += "/" + string += make_name_string(opt.data) + "/" + + string += make_name_string(opt.net) + "/" + string += make_name_string(opt.train.static) + "/" + + if eval_: + string += make_name_string(opt.eval) + "/" + # mkpath caches whether a directory has been created + # In IPython, this can be a problem if the kernel is + # not reset after a dir is deleted. Trying to recreate + # that dir will be a problem because mkpath will think + # the directory already exists + if not is_dir: + mkpath(string) + string += make_name_string( + opt.train.dynamic, True, do_epoch, set_epoch) + if is_dir: + mkpath(string) + + return string + + +def make_name_string(dict_, final=False, do_epoch=False, set_epoch=None): + if final: + if not do_epoch: + string = "{}_{}_{}".format( + dict_.lr, dict_.optim, dict_.bs) + elif set_epoch is not None: + string = "{}_{}_{}_{}".format( + dict_.lr, dict_.optim, dict_.bs, set_epoch) + else: + string = "{}_{}_{}_{}".format( + dict_.lr, dict_.optim, dict_.bs, dict_.epoch) + + return string + + string = "" + + for k, v in dict_.items(): + if type(v) == DD: + continue + if isinstance(v, list): + val = "#".join(is_bool(str(vv)) for vv in v) + else: + val = is_bool(v) + if string: + string += "-" + string += "{}_{}".format(k, val) + + return string + + +def is_bool(v): + if str(v) == "False": + return "F" + elif str(v) == "True": + return "T" + return v + + +def generate_config_files(type_, key, name="base", eval_mode=False): + with open("config/default.json".format(type_), "r") as f: + base_config = json.load(f) + with open("config/{}/default.json".format(type_), "r") as f: + base_config_2 = json.load(f) + if eval_mode: + with open("config/{}/eval_changes.json".format(type_), "r") as f: + changes_by_machine = json.load(f) + else: + with open("config/{}/changes.json".format(type_), "r") as f: + changes_by_machine = json.load(f) + + base_config.update(base_config_2) + + if name in changes_by_machine: + changes = changes_by_machine[name] + else: + changes = changes_by_machine["base"] + + # for param in changes[key]: + # base_config[param] = changes[key][param] + + replace_params(base_config, changes[key]) + + mkpath("config/{}".format(type_)) + + with open("config/{}/config_{}.json".format(type_, key), "w") as f: + json.dump(base_config, f, indent=4) + + +def replace_params(base_config, changes): + for param, value in changes.items(): + if isinstance(value, dict) and param in base_config: + replace_params(base_config[param], changes[param]) + else: + base_config[param] = value + + +def initialize_progress_bar(data_loader_list): + num_examples = sum([len(tensor) for tensor in + data_loader_list.values()]) + return set_progress_bar(num_examples) + + +def set_progress_bar(num_examples): + bar = tqdm(total=num_examples) + bar.update(0) + return bar + + +def merge_list_of_dicts(L): + result = {} + for d in L: + result.update(d) + return result + + +def return_iterator_by_type(data_type): + if isinstance(data_type, dict): + iterator = data_type.items() + else: + iterator = enumerate(data_type) + return iterator + + +@contextlib.contextmanager +def temp_seed(seed): + state = np.random.get_state() + np.random.seed(seed) 
+ try: + yield + finally: + np.random.set_state(state) + + +def flatten(outer): + return [el for inner in outer for el in inner] + + +def zipped_flatten(outer): + return [(key, fill, el) for key, fill, inner in outer for el in inner] + + +def remove_none(l): + return [e for e in l if e is not None] + + +# Taken from Jobman 0.1 +class DD(dict): + def __getattr__(self, attr): + if attr == '__getstate__': + return super(DD, self).__getstate__ + elif attr == '__setstate__': + return super(DD, self).__setstate__ + elif attr == '__slots__': + return super(DD, self).__slots__ + return self[attr] + + def __setattr__(self, attr, value): + # Safety check to ensure consistent behavior with __getattr__. + assert attr not in ('__getstate__', '__setstate__', '__slots__') +# if attr.startswith('__'): +# return super(DD, self).__setattr__(attr, value) + self[attr] = value + + def __str__(self): + return 'DD%s' % dict(self) + + def __repr__(self): + return str(self) + + def __deepcopy__(self, memo): + z = DD() + for k, kv in self.items(): + z[k] = copy.deepcopy(kv, memo) + return z diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c58963839384410281e3d071049c70a95e37314b --- /dev/null +++ b/app/__init__.py @@ -0,0 +1,29 @@ +from fairseq.models.roberta import RobertaModel + +import sys + +PATH_TO_COSMIC = "../Model/COSMIC" +EXTRACTORS_PATH = PATH_TO_COSMIC + "/feature_extraction" +EPIK_MODEL_DIR = PATH_TO_COSMIC + "/erc_training" + +sys.path.append(PATH_TO_COSMIC) +sys.path.append(EXTRACTORS_PATH) +sys.path.append(EPIK_MODEL_DIR) +from Model.COSMIC.feature_extraction.comet.csk_feature_extract import ( + CSKFeatureExtractor, +) + +from Model.COSMIC.erc_training.predict_epik import parse_cosmic_args, load_model + +roberta = RobertaModel.from_pretrained( + EXTRACTORS_PATH + "/checkpoints/epik/", + checkpoint_file="checkpoint_best.pt", + data_name_or_path="../../epik-bin", +) +roberta.eval() + +comet = CSKFeatureExtractor(dir=EXTRACTORS_PATH) + +cosmic_args = parse_cosmic_args() + +COSMIC_MODEL = load_model(EPIK_MODEL_DIR + "/epik/best_model.pt", cosmic_args) diff --git a/app/app.py b/app/app.py new file mode 100644 index 0000000000000000000000000000000000000000..60b94b6cd31f87f1879da63bdc93618b124b6442 --- /dev/null +++ b/app/app.py @@ -0,0 +1,13 @@ +# +# to run app, cd to app directory and do: +# python ./app.py --active-listener --class-weight --residual +# + +import gradio as gr +from views import cosmic_view + +cosmic_model = cosmic_view.cosmic_ui() +demo = gr.TabbedInterface([cosmic_model], ["COSMIC"]) + +if __name__ == "__main__": + demo.launch() diff --git a/app/views/cosmic_view.py b/app/views/cosmic_view.py new file mode 100644 index 0000000000000000000000000000000000000000..c140bea594d6fc777782a5804991a9019cbb5c02 --- /dev/null +++ b/app/views/cosmic_view.py @@ -0,0 +1,221 @@ +import os +import pickle +import tempfile +import gradio as gr +from tqdm import tqdm +from views.utils import ( + create_input_instruction, + format_prediction_ouptut, + remove_temp_dir, + EXAMPLE_CONVERSATIONS, +) +from fairseq.data.data_utils import collate_tokens + +import sys + +sys.path.insert(0, "../") # neccesary to load modules outside of app + +from app import roberta, comet, COSMIC_MODEL, cosmic_args +from preprocessing import preprocess +from Model.COSMIC.erc_training.predict_epik import predict, get_valid_dataloader + + +def cosmic_preprocess(input, dir="."): + result = preprocess.process_user_input(input) + + if not result["success"]: + raise 
gr.Error(result["message"])
+
+    data = result["data"]
+
+    # process the data and turn it into a csv file
+    output_csv_path = os.path.join(dir, "epik.csv")
+    grouped_df = preprocess.prepare_csv(data, output_csv_path, with_label=False)
+
+    # convert the csv to a pickle file of speakers, labels, sentences
+    pickle_dest = os.path.join(dir, "epik.pkl")
+    preprocess.convert_to_pickle(
+        source=output_csv_path,
+        dest=pickle_dest,
+        index_col="ConversationId",
+        list_type_columns=[
+            "Text",
+            "ParticipantRoleEncoded",
+            "LabelNumeric",
+        ],
+        order=[
+            "ParticipantRoleEncoded",
+            "LabelNumeric",
+            "Text",
+        ],
+        exclude=["ParticipantRole"],
+    )
+
+    # split the ids for prediction; we'll put all of them in the validation set
+    preprocess.split_and_save_ids(
+        grouped_df["ConversationId"].to_list(), 0, 0, 1, dir=dir
+    )
+
+    # add the ids into the pickle file
+    preprocess.merge_pkl_with_ids(
+        pickle_src=pickle_dest,
+        ids_files=["train_set.txt", "test_set.txt", "validation_set.txt"],
+        dir=dir,
+    )
+
+    # generate the sentences pickle file
+    sentences_pkl_path = os.path.join(dir, "epik_sentences.pkl")
+    preprocess.convert_to_pickle(
+        source=output_csv_path,
+        dest=sentences_pkl_path,
+        index_col="ConversationId",
+        list_type_columns=["Text"],
+        exclude=[
+            "ParticipantRole",
+            "ParticipantRoleEncoded",
+            "LabelNumeric",
+        ],
+    )
+
+    return pickle_dest, sentences_pkl_path
+
+
+def cosmic_roberta_extract(path, dest_dir="."):
+    # load the features from the file at path
+    speakers, labels, sentences, train_ids, test_ids, valid_ids = pickle.load(
+        open(path, "rb")
+    )
+    roberta1, roberta2, roberta3, roberta4 = {}, {}, {}, {}
+
+    all_ids = train_ids + test_ids + valid_ids
+
+    for i in tqdm(range(len(all_ids))):
+        item = all_ids[i]
+        sent = sentences[item]
+        sent = [s.encode("ascii", errors="ignore").decode("utf-8") for s in sent]
+        batch = collate_tokens([roberta.encode(s) for s in sent], pad_idx=1)
+        feat = roberta.extract_features(batch, return_all_hiddens=True)
+        # keep the [CLS] vector from each of the last four layers
+        roberta1[item] = [row for row in feat[-1][:, 0, :].detach().numpy()]
+        roberta2[item] = [row for row in feat[-2][:, 0, :].detach().numpy()]
+        roberta3[item] = [row for row in feat[-3][:, 0, :].detach().numpy()]
+        roberta4[item] = [row for row in feat[-4][:, 0, :].detach().numpy()]
+
+    roberta_feature_path = os.path.join(dest_dir, "epik_features_roberta.pkl")
+    pickle.dump(
+        [
+            speakers,
+            labels,
+            roberta1,
+            roberta2,
+            roberta3,
+            roberta4,
+            sentences,
+            train_ids,
+            test_ids,
+            valid_ids,
+        ],
+        open(roberta_feature_path, "wb"),
+    )
+
+    return roberta_feature_path
+
+
+def cosmic_comet_extract(path, dir="."):
+    print("Extracting features in", path)
+    sentences = pickle.load(open(path, "rb"))
+    features = comet.extract(sentences)
+
+    comet_feature_path = os.path.join(dir, "epik_features_comet.pkl")
+    pickle.dump(features, open(comet_feature_path, "wb"))
+
+    return comet_feature_path
+
+
+def cosmic_classifier(input):
+    # create a temporary directory for the input data
+    temp_dir = tempfile.mkdtemp(dir=os.getcwd(), prefix="temp")
+
+    epik_path, epik_sentences_path = cosmic_preprocess(input, temp_dir)
+
+    roberta_path = cosmic_roberta_extract(epik_path, temp_dir)
+    comet_path = cosmic_comet_extract(epik_sentences_path, temp_dir)
+
+    # use the COSMIC model to make predictions
+    data_loader, ids = get_valid_dataloader(roberta_path, comet_path)
+    predictions = predict(COSMIC_MODEL, data_loader, cosmic_args)
+
+    speakers, _, sentences, _, _, valid_ids = pickle.load(open(epik_path, "rb"))
+
+    # Assuming that there's only one conversation
+    conv_id = ids[0]
+    output = format_prediction_ouptut(
+        speakers[conv_id], sentences[conv_id], predictions[0]
+    )
+
+    print()
+    print("======= Removing Temporary Directory =======")
+    remove_temp_dir(temp_dir)
+    return output
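+
+
+# A minimal usage sketch (illustrative only, not wired into the UI below):
+# assuming the roberta/comet/COSMIC models imported from app/__init__.py have
+# loaded successfully, the whole pipeline can be exercised directly:
+#
+#     sample = "Visitor:Hi! How are you?\nAgent:Hi! I'm good. And you?"
+#     print(cosmic_classifier(sample))
+#
+# The output is one "[<label>] <speaker>:<message>" block per utterance, as
+# rendered by format_prediction_ouptut in views/utils.py.
+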
+def cosmic_ui():
+    with gr.Blocks() as cosmic_model:
+        gr.Markdown(
+            """
+            # COSMIC
+            COSMIC is a popular model for predicting sentiment labels using the entire
+            context of the conversation. In other words, it analyzes the previous
+            messages to predict the sentiment label for the current message.
+            The model was adapted from this
+            [repo](https://github.com/declare-lab/conv-emotion.git), implemented based
+            on this research [paper](https://arxiv.org/pdf/2010.02795.pdf).
+
+            > COSMIC: COmmonSense knowledge for eMotion Identification in
+            > Conversations. D. Ghosal, N. Majumder, A. Gelbukh, R. Mihalcea, &
+            > S. Poria. Findings of EMNLP 2020.
+            """
+        )
+
+        create_input_instruction()
+        with gr.Row():
+            with gr.Column():
+                example_dropdown = gr.Dropdown(
+                    choices=["-- Not Selected --"] + list(EXAMPLE_CONVERSATIONS.keys()),
+                    value="-- Not Selected --",
+                    label="Select an example",
+                )
+
+                gr.Markdown("--- OR ---")
+
+                conversation_input = gr.TextArea(
+                    value="",
+                    label="Input your conversation",
+                    placeholder="Please input your conversation here.\n\n\n\nMaximum number of lines: 200",
+                    lines=5,
+                    max_lines=200,
+                )
+
+                def on_example_change(input):
+                    if input in EXAMPLE_CONVERSATIONS:
+                        return EXAMPLE_CONVERSATIONS[input]
+
+                    return ""
+
+                example_dropdown.input(
+                    on_example_change,
+                    inputs=example_dropdown,
+                    outputs=conversation_input,
+                )
+
+                submit_btn = gr.Button(value="Submit")
+
+            with gr.Column():
+                gr.Markdown("Predicted Sentiment Labels for the Conversation")
+                output = gr.Markdown(value="", label="Output")
+
+        submit_btn.click(cosmic_classifier, conversation_input, output)
+
+        conversation_input.change(lambda x: "", conversation_input, output)
+    return cosmic_model
diff --git a/app/views/utils.py b/app/views/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..17835ef22842e1b7abd800835dc92f4fb813eba8
--- /dev/null
+++ b/app/views/utils.py
@@ -0,0 +1,63 @@
+import gradio as gr
+import os
+import sys
+
+sys.path.append("..")
+
+from preprocessing.preprocess import decode_numeric_label, decode_speaker_role
+
+
+def create_input_instruction():
+    gr.Markdown(
+        """
+        # Instructions
+        Get started by inputting the conversation into the text box below. The
+        conversation should be in the format of `<speaker>:<message>` where speaker is
+        either Visitor or Agent. There should only be one message per line.
+
+        For example:
+            Visitor:Hi! How are you?
+            Visitor:Are you there?
+     Agent:Hi! I'm good. And you? + + You can also choose from one of the examples below to get a quickstart. + """ + ) + + +def format_prediction_ouptut(numeric_roles, messages, numeric_labels): + output = "" + + for numeric_role, message, numeric_label in zip( + numeric_roles, messages, numeric_labels + ): + label = decode_numeric_label(numeric_label) + role = decode_speaker_role(numeric_role) + + output += f"[{label}] {role}:{message}\n\n" + + return output + + +def remove_temp_dir(temp_dir): + # List all files in the temporary directory + files = os.listdir(temp_dir) + + # Remove each file within the directory + for file in files: + file_path = os.path.join(temp_dir, file) + if os.path.isfile(file_path): + os.remove(file_path) + print(f"=== File '{file}' has been removed.") + else: + remove_temp_dir(file_path) + print(f"=== Directory '{file}' has been removed.") + + os.rmdir(temp_dir) + + +EXAMPLE_CONVERSATIONS = { + "Example 1": "Visitor:Hey Jess, available for outcall tonight?\n\nAgent:hey hey what you looking for luv?\n\nVisitor:Some company in El Segundo\n\nAgent:out with friends or just the two of us?\n\nVisitor:Just the two\n\nAgent:looking for some connection then...\n\nVisitor:Yes\n\nAgent:I get it. Looking for companionship, it's a very legit need. I'll level with you. We're a group of guys trying to show other men the harms of the sex trade industry, both to buyers and sellers. You up for a little conversation? No pressure... no judgement.\n\nVisitor:No, but interested in the harms in companionship?\n\nAgent:The truth is nearly all of the women on these sites are not in this career by choice. We work with survivors. Their money goes to pimps.. their lives are controlled.\n\nVisitor:Really?\n\nVisitor:Even when they say \"independent\"?\n\nAgent:Yes. It's all part of the human trafficking system... pimps find & recruit nearly all women who are doing this, they are very, very good at manipulation and control, and working in the background. Sadly, this modern day slavery will be with us as long as there are willing buyers. That's why we are trying to work this angle. The more men that know the truth, the fewer will want to exploit someone in this way.\n\nAgent:I need to move on, but just consider what I've told you tonight. Polarisproject.org is a great resource to learn more... it's nothing like what Hollywood shows ya. We work with the women & some guys who have escaped out of this lifestyle. It is modern-day slavery, straight up. Thanks for the chat. Good night!\n\nVisitor:Thanks, fyi it worked. Hung it up, I'm to old to me messin around in this crazy stuff take care\n\nAgent:I am so glad to hear this friend. You calling this number tonight was not just a coincidence. Take care also!!\n\n", + "Example 2": "Visitor:What's up sexy\n\nAgent:Thank you! I'm hangin\n\nVisitor:Are you available at the moment\n\nAgent:what's your name\n\nVisitor:David ... Yours\n\nAgent:Dakota is not available right now, but my name is Jeff and I'm working with a bunch of guys trying to curb the demand for victims of human trafficking. Did you ever consider that Dakota could be a victim of trafficking?\n\nVisitor:Fuck no I just wanted to get some pussy .... Bye\n\nAgent:Just so you know your number is in our system now. Understand that next time you click on one of these ads you could be walking into an undercover cop and many of these girls are underage. That is a crime that will follow you the rest of your life!\n\nVisitor:. You think the predators your looking for going to fall into your lap? 
Lol your not a cop stop trying to be one ... You want to help?\n\nVisitor:Just so you know I don't give no fucks about you or no fucking cops... Threatening me will not get you shit from me ,no info no nothing you stupid cunt..\n\nAgent:Hey man, I did not want to threaten you. I want you to be aware of the risks. Have you heard that most women in prostitution do it against their will, and many are minors?\n\nVisitor:I never fuck with minors that's for sure cause you can tell if they are not of age\n\nAgent:Yeah, many people think it's harmless and these women are in complete control. But, the truth is, most of these women are enslaved, forced to sell themselves by pimps who take 100% of their money. I hope you can see these women in a different light.\n\nVisitor:I know most are blacks living off a woman cause they can't get no job ... I won't see a female if she has a guy around\n\nAgent:I'm curious, is there something missing inside of you, or are you angry about something not right in your life that causes you to seek out connection with these girls?\n\nVisitor:No actually I am on gps monitor and can't go out and meet no one so I got needs and feel sometimes hooking up with one of these girls here and there is w\n\nVisitor:hat works for my situation....but I'm not disrespectful to them or be doing ugly things ... My ex wife cheated on me after 8yrs of marriage I don't reall\n\nVisitor:y trust women like enough for a relationship\n\nVisitor:I'm being honest with you cause even tho I don't know u I believe in doing the right thing and your movement with alot of work is going to save lives so\n\nVisitor:that's good\n\nAgent:Hey David, I'm very sorry for your situation and the pain you must be in. Were you raised with any faith perspective?\n\nVisitor:Yes I actually was baptized for the first time in my life in 2018 and have made some drastic changes to my life.... I'm not perfect but I am working hard\n\nAgent:Okay, David. That's great news. We're all a work in progress. God is faithful to forgive us of our sins if we just ask him. Read 1 John 1:9. Can I direct you to some resources to help you overcome this urge to buy sex?\n\nVisitor:Honestly I'm not ready for that is all I can say at the moment.... Sorry but I don't like to lie\n\nAgent:Hey, no worries. When you're ready, check out www.celebraterecovery.com as they have a lot of resources that might help. I will be praying for you, David. I've had a lot of pain in my life and only in my relationship with Jesus have I found true peace that surpasses my understanding. I pray that for you too.\n\nVisitor:What kinda pain ? Do you mind sharing?\n\nVisitor:And what are y'all doing to put those pieces of shit in prison who are hurting these women\n\nAgent:I've had several people close to me pass away tragically in the past few years. Thanks for asking. I'm just trusting God each day trusting he's in control.\n\nAgent:Hey, honestly, the best way we can solve this problem of evil pieces of shit harming these girls is to dry up the demand for these girls by encouraging men like us to stop calling these ads. If the demand dries up, evil predatory scums' business with die.\n\nAgent:Can I get your commitment to do your part and stop engaging these girls on their escort ads?\n\nVisitor:Yes I will but I can promise you that it will not stop matter of fact it will get worse ... They will beat them more and make them do it more only way is\n\nVisitor:to put the pimps in prison for alot of years....\n\nAgent:Hey, I'll be praying for you. 
Feel free to call me any time if you need to talk. My number is (615) 628-7545. God bless you, brother.\n\nVisitor:Thank u\n\n", + "Example 3": "Visitor:Heyyy\n\nVisitor:How are you this evening\n\nAgent:better now ;) call me\n\nVisitor:Iam at work for now ,be off around 10pm\n\nVisitor:Need some company\n\nVisitor:Are you independent honey\n\nAgent:well since you arent available at the moment ill just come out and say-these sites are bad news. did you know that most of the girls on here are here against their will? Most of them got dragged into this lifestyle by an abuser, oftentimes before they were of legal consenting age. isnt that sad?\n\nAgent:we are with some guys who are trying to spread awareness of the realities of this \"industry\".\n\nAgent:https://exoduscry.com/choice/\n\nVisitor:Thanks\n\nAgent:i encourage you to watch this video. it is jarring to think about how bad someone else's options must be to choose to be on these sites\n\nVisitor:Ooohhh\n\nAgent:selling their body to make ends meet or appease a pimp\n\nVisitor:That's really awful\n\nAgent:it is. you seem like the kind of guy who wouldnt wont to proliferate that kind of harmful lifestyle. am i right in thinking that?\n\nVisitor:Well iam just looking for attention\n\nVisitor:My marriage is not going well lol\n\nAgent:i know that it is hard to find ourselves lonely and without much alternative to meet that perceived need but its humbling to think that our needs can force someone else into such a dark place\n\nAgent:hey, thanks for sharing that my man. i know it can be hard\n\nAgent:marraige is the most humbling of relationships, isnt it?\n\nVisitor:She leaves with her friends n no time for me\n\nAgent:ive been there my guy. i know that it is alot easier to numb that loneliness for sure\n\nVisitor:I want to be faithful\n\nAgent:does your wife know how you feel when she chooses her friends instead of you?\n\nVisitor:I been drinking lately\n\nVisitor:Yes , she takes pills\n\nAgent:if so, i hope you are praying for her to realize the hurt she is causing and to seek change\n\nVisitor:She had surgery 4 yes ago n it's been hard for her n her addiction on pills\n\nVisitor:Yes for now iam looking for a female friend to talk n see what can we do for each other\n\nAgent:that is hard my man. physical pain is a huge obstacle in life for sure so i hear you\n\nVisitor:Well chat later .thanks\n\nAgent:have you considered pursuing other men who can encourage you instead of looking for the easy way out?\n\nAgent:what is your name my friend? i will be praying for you by name if you wouldnt mind sharing it\n\nAgent:well, i gotta run. watch that video i sent and i will definitely be praying for you. I hope you pray for yourself and for your wife - God can definitely intervene and cause complete change in the situation if He wills it. He is good and He hears you. You are loved by Him, brother. 
Good night\n\n", +} diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/preprocessing/preprocess.py b/preprocessing/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..f4baa509dde0345fbab21a5cbda4effc001583ec --- /dev/null +++ b/preprocessing/preprocess.py @@ -0,0 +1,276 @@ +import os +import pickle +import random +import pandas as pd + +SPEAKERA_ROLE_MAP = {"Agent": 0, "Visitor": 1} + +LABEL_MAP = { + "Curiosity": 0, + "Obscene": 1, + "Informative": 2, + "Openness": 3, + "Acceptance": 4, + "Interest": 5, + "Greeting": 6, + "Disapproval": 7, + "Denial": 8, + "Anxious": 9, + "Uninterested": 10, + "Remorse": 11, + "Confused": 12, + "Accusatory": 13, + "Annoyed": 14, +} + + +def process_user_input(input: str): + """Parse the user input and return a list of row where each row is a list with + format `[, , ]`. + + Args: + input (str): the input of the user with each line has the format of + `:`. Only one message per line. + + Returns: + dict: a dictionary containing whether the input was successfully processed and + if so, the processed data of the input. + """ + if input == None or input == "": + return {"success": False, "message": "Input must not be an empty string!"} + + data = [] + for line in input.split("\n"): + if line == "": + continue + try: + speaker, message = line.split(":", 1) + + if speaker != "Agent" and speaker != "Visitor": + return {"success": False, "message": f"Invalid speaker {speaker}"} + + # Assuming there's only one input conversation + # Give it a dummy conversation id of epik_0 + data.append(["epik_0", speaker, message]) + except: + return {"success": False, "message": "Invalid Input"} + + return { + "success": True, + "message": "Success", + "data": data, + } + + +def encode_speaker_role(role): + return SPEAKERA_ROLE_MAP.get(role, 1) + + +def decode_speaker_role(role_numeric): + for role, numeric_val in SPEAKERA_ROLE_MAP.items(): + if role_numeric == numeric_val: + return role + + return "Unknow Speaker" + + +def encode_sentiment_label(label): + return LABEL_MAP.get(label, -1) + + +def decode_numeric_label(label_numeric): + for label, numeric_val in LABEL_MAP.items(): + if label_numeric == numeric_val: + return label + + return "Unknow Label" + + +def preapre_csv(data: list[list], output_path: str, with_label: bool = False): + """ + Process and group the speakers, messages, and labels (if any) by conversation + ids. This function is useful to prepare the neccesary csv file before converting it into + pickle file. + + + Args: + data (list[list]): A list contains the rows of a dataframe. Each row contains + values representing the coversation id, speaker role, message (, and label if any) in this order. + output_path (str): path to write the csv file. + with_label (bool, optional): Whether the input data contains labels (ie, for + training) or not (ie, for making predictions on a new sample). Defaults to False. 
+ """ + columns = ["ConversationId", "ParticipantRole", "Text"] + + if with_label: + columns += ["Label"] + + df = pd.DataFrame(data=data, columns=columns) + + # encode the participant role + df["ParticipantRoleEncoded"] = df["ParticipantRole"].apply( + lambda role: encode_speaker_role(role) + ) + + # encode the labels + if with_label: + df["LabelNumeric"] = df["Label"].apply( + lambda label: encode_sentiment_label(label) + ) + else: + # Give the new input dummy labels to match the model input shape + df["LabelNumeric"] = df["ParticipantRole"].apply(lambda _: -1) + + # group the data into list based on conversation id + agg_params = {"Label": list} if with_label else {} + agg_params.update( + { + "ParticipantRole": list, + "ParticipantRoleEncoded": list, + "Text": list, + "LabelNumeric": list, + } + ) + grouped_df = df.groupby("ConversationId").agg(agg_params).reset_index() + + grouped_df.to_csv(output_path, index=False, encoding="ascii") + + return grouped_df + + +def convert_to_pickle( + source: str, + dest: str, + index_col: str = None, + list_type_columns: list = [], + order=[], + exclude=[], + single_tuple=False, +): + """Convert a csv file into a pickle file with format + col1, col2, ..., coln + + Args: + source (str): path to csv file + dest (str): the location where the pickle file will be stored + index_col (str): the column with unique ids that serves as index. Default to + None + order (list, optional): specify the order for one or many columns from left to + right, followed by columns not in order. + exclude (list, optional): columns to be excluded from the result. Defaults to + []. + single_tuple (bool): whether or not to output as tuple if there is only one + single column. Default to False. + """ + df = pd.read_csv(source) + df = df.drop(columns=exclude) + + # convert column from string representation of a list to list + for col in list_type_columns: + if col in df.columns: + df[col] = df[col].fillna("[]").apply(lambda x: eval(x)) + + if index_col != None: + df = df.set_index(index_col) + + # reorder the columns + if order != []: + left = df[order] + right = df[[col for col in df.columns if col not in order]] + df = pd.concat([left, right], axis=1) + + output = () + for col in df.columns: + output += (df[col].to_dict(),) + + if not single_tuple and len(output) == 1: + output = output[0] + + with open(dest, "wb") as f: + pickle.dump(output, f) + f.close() + + return + + +def split_and_save_ids( + ids, train_ratio=0.8, test_ratio=0.1, valid_ratio=0.1, dir=".", seed=None +): + """ + Randomly split a list of IDs into training, testing, and validation sets and save them to text files. + + Args: + ids (list): List of IDs to be split. + train_ratio (float): Ratio of IDs for the training set (default is 0.8). + test_ratio (float): Ratio of IDs for the testing set (default is 0.1). + valid_ratio (float): Ratio of IDs for the validation set (default is 0.1). + dir (str): the path to the directory to save the files for ids + seed (int): Seed for randomization (default is None). + + Returns: + train_set (list): List of IDs in the training set. + test_set (list): List of IDs in the testing set. + valid_set (list): List of IDs in the validation set. 
+ """ + + # Check if the ratios add up to 1.0 + assert train_ratio + test_ratio + valid_ratio == 1.0, "Ratios should add up to 1.0" + + # Set random seed for reproducibility + if seed is not None: + random.seed(seed) + + # Shuffle the list of IDs + random.shuffle(ids) + + # Calculate the split points + train_split = int(len(ids) * train_ratio) + test_split = train_split + int(len(ids) * test_ratio) + + # Split the IDs + train_set = ids[:train_split] + test_set = ids[train_split:test_split] + valid_set = ids[test_split:] + + # Save the sets to text files + def save_to_txt(file_path, id_set): + with open(file_path, "w") as file: + id_strings = [str(conv_id) for conv_id in id_set] + file.write("\n".join(id_strings)) + + save_to_txt(os.path.join(dir, "train_set.txt"), train_set) + save_to_txt(os.path.join(dir, "test_set.txt"), test_set) + save_to_txt(os.path.join(dir, "validation_set.txt"), valid_set) + + return train_set, test_set, valid_set + + +def merge_pkl_with_ids(pickle_src: str, ids_files: list, dir: str = "."): + """Merge an existing pickle file with id files, resulting in a pickle file with 3 + more fields of train_ids, test_ids, and valid_ids. + + Args: + pickle_src (str): the path to the pickle file + ids_files (list): list of files that contain ids. Example: + ["train_set.txt", "test_set.txt", "validation_set.txt"]. Each file should + contain one single unique id on each line. + dir (str, optional): the directory for ids_files. Defaults to ''. + """ + ids_set = () + for filename in ids_files: + ids = [] + path = os.path.join(dir, filename) + with open(path, "r") as file: + for line in file: + ids.append(line.strip()) + + ids_set += (ids,) + + with open(pickle_src, "rb") as file: + data = pickle.load(file) + data += ids_set + file.close() + + with open(pickle_src, "wb") as file: + pickle.dump(data, file) + file.close()