File size: 3,482 Bytes
0a16720
 
1119202
fc4d00d
bfb1fa6
fc4d00d
bfb1fa6
fc4d00d
 
 
bfb1fa6
 
fc4d00d
 
 
 
bfb1fa6
fc4d00d
bfb1fa6
fc4d00d
bfb1fa6
fc4d00d
 
 
 
bfb1fa6
fc4d00d
 
 
bfb1fa6
 
fc4d00d
 
bfb1fa6
fc4d00d
 
bfb1fa6
fc4d00d
bfb1fa6
fc4d00d
bfb1fa6
fc4d00d
bfb1fa6
fc4d00d
bfb1fa6
fc4d00d
 
 
 
bfb1fa6
 
 
 
 
 
 
 
 
 
 
 
fc4d00d
bfb1fa6
fc4d00d
 
e08cc47
fc4d00d
bfb1fa6
 
 
 
fc4d00d
bfb1fa6
 
 
 
 
fc4d00d
bfb1fa6
fc4d00d
 
bfb1fa6
fc4d00d
bfb1fa6
 
 
 
 
 
bbdeb84
 
 
 
 
 
 
 
 
 
fc4d00d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# UPDATE: NEW AND IMPROVED MODEL AVAILABLE AT https://huggingface.co./maxpe/twitter-roberta-base-jun2022_sem_eval_2018_task_1

# Twitter-roBERTa-base_SemEval18_Emodetection

This is a Twitter-roBERTa-base model trained on ~7000 tweets in English annotated for 11 emotion categories in [SemEval-2018 Task 1: Affect in Tweets: SubTask 5: Emotion Classification](https://competitions.codalab.org/competitions/17751). 

Run the classifier on the test set of the competition:

```python
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
import torch
import pandas as pd

# choose GPU when available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base",model_max_length=512)

# build custom model with classification layer on top and a dropout layer before
class RobertaClass(torch.nn.Module):
    
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base",return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 11)
        
    def forward(self, input_ids, attention_mask):
        _, output_1= self.l1(input_ids=input_ids, attention_mask=attention_mask)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        
        return output

model_name="twitter-roberta-base_semeval18_emodetection/pytorch_model.bin"

model=RobertaClass()

model.load_state_dict(torch.load(model_name,map_location=torch.device(device)))

model.eval()

# run on more than 1 GPU
model = torch.nn.DataParallel(model)

model.to(device)

twnames=['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']

# load from hugging face dataset hub
testset_raw = load_dataset('sem_eval_2018_task_1','subtask5.english',split='test')

# remove old columns
testset=testset_raw.remove_columns(twnames+["ID"])

# tokenize
testset_tokenized = testset.map(lambda e: tokenizer(e['Tweet'], truncation=True, padding='max_length'), batched=True)

testset_tokenized=testset_tokenized.remove_columns("Tweet")

testset_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])


outfile="predicted_2018-E-c-En-test-gold.txt"

MAX_LEN = 512
VALID_BATCH_SIZE = 8
# set batch size according to available RAM
# VALID_BATCH_SIZE = 1000

# set num_workers for parallel processing
inference_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                # 'num_workers': 1
                }

inference_loader = DataLoader(testset_tokenized, **inference_params)


open(outfile,"w").close()
with torch.no_grad():
    # change lines for progress manager
    # for _, data in tqdm(enumerate(inference_loader, 0),total=len(inference_loader)):
    for _, data in enumerate(inference_loader, 0):
        outputs = model(input_ids=data['input_ids'],attention_mask=data['attention_mask'])
        fin_outputs=torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        pd.DataFrame(fin_outputs).to_csv(outfile,index=False,header=False,sep="\t",mode='a')
        
        
# # dataset from file (one text per line)
# from datasets import Dataset

# with open(linesoftextfile,"rb") as textfile:
#     textdict={"text":[x.decode().rstrip("\n") for x in textfile.readlines()]}
    
# inference_dataset=Dataset.from_dict(textdict)
# del(textdict)
```