maxpe commited on
Commit
fc4d00d
·
1 Parent(s): 11c2b2b

added README

Browse files
Files changed (1) hide show
  1. README.md +91 -0
README.md ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Twitter-roBERTa-base
2
+
3
+ This is a Twitter-roBERTa-base model trained on ~7000 tweets annotated for 11 emotion categories in [SemEval-2018 Task 1: Affect in Tweets, SubTask 5: Emotion Classification](https://competitions.codalab.org/competitions/17751).
4
+
5
+ Run the example script below as follows:
6
+
7
+ ```bash
8
+ python3 predict_11emoclasses.py testfile
9
+ ```
10
+
11
+ ```python
12
+ #!/usr/bin/env python3
13
+ # -*- coding: utf-8 -*-
14
+ """
15
+ Created on Wed Aug 4 17:56:24 2021
16
+
17
+ @author: maxpe
18
+ """
19
+
20
+ import transformers
21
+
22
+ from datasets import load_dataset
23
+
24
+ from transformers import AutoTokenizer, AutoConfig
25
+
26
+ import torch
27
+
28
+ from tqdm import tqdm
29
+
30
+ from torch import cuda
31
+
32
+ import pandas as pd
33
+
34
+ import sys
35
+
36
# Prefer the GPU when one is available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Path to the input text file (one tweet per line), taken from the command line.
file = sys.argv[1]
42
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder with a dropout + linear head producing 11 logits,
    one per SemEval-2018 Task 1 emotion category.

    NOTE(review): this class documents the architecture the checkpoint was
    trained with; the inference code below loads the weights via AutoModel
    rather than instantiating this class — confirm which path is intended.
    """

    def __init__(self):
        super().__init__()
        self.l1 = transformers.RobertaModel.from_pretrained("cardiffnlp/twitter-roberta-base")
        self.l2 = torch.nn.Dropout(0.3)
        # RoBERTa-base hidden size (768) -> 11 emotion logits.
        self.l3 = torch.nn.Linear(768, 11)

    def forward(self, ids, mask):
        """Return raw (pre-sigmoid) logits of shape (batch, 11)."""
        # return_dict=False forces a plain tuple return so the
        # (sequence_output, pooled_output) unpacking keeps working on
        # transformers >= 4.x, whose default ModelOutput object cannot be
        # tuple-unpacked into tensors (iterating it yields the field names).
        _, pooled_output = self.l1(ids, attention_mask=mask, return_dict=False)
        output = self.l3(self.l2(pooled_output))
        return output
55
+
56
+
57
# Load the fine-tuned emotion-detection checkpoint, then attach the base
# model's configuration to it.
model = transformers.AutoModel.from_pretrained(
    "maxpe/twitter-roberta-base_semeval18_emodetection"
)
model.config = transformers.RobertaConfig.from_pretrained(
    "cardiffnlp/twitter-roberta-base"
)

# Inference only: disable dropout, spread batches across all visible GPUs,
# and move the parameters to the chosen device.
model.eval()
model = torch.nn.DataParallel(model)
model.to(device)

# Tokenizer matching the base encoder, capped at RoBERTa's 512-token limit.
tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base", model_max_length=512
)
69
+
70
+
71
# One line of the input file = one example; tokenize everything up front.
dataset = load_dataset('text', data_files={'test': file})


def _tokenize(batch):
    """Tokenize a batch of raw tweet texts, padding/truncating to max length."""
    return tokenizer(batch['text'], truncation=True, padding='max_length')


dataset = dataset.map(_tokenize, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make this smaller when you get a memory error.
BATCH_SIZE = 32

dataloader = torch.utils.data.DataLoader(dataset['test'], batch_size=BATCH_SIZE)
81
+
82
# Truncate/create the output file so the per-batch appends below start fresh.
with open(file + "_11emo", "w"):
    pass

with torch.no_grad():
    # Wrap the dataloader in tqdm(dataloader) here for a progress bar.
    for batch in dataloader:
        logits = model(batch['input_ids'], batch['attention_mask'])
        # Per-class sigmoid: independent probabilities for the 11 emotions
        # (multi-label task, so no softmax across classes).
        probs = torch.sigmoid(logits).tolist()
        pd.DataFrame(probs).to_csv(
            file + "_11emo", index=False, header=False, sep="\t", mode='a'
        )
91
+ ```