# UPDATE: NEW AND IMPROVED MODEL AVAILABLE AT https://huggingface.co./maxpe/twitter-roberta-base-jun2022_sem_eval_2018_task_1
# Twitter-roBERTa-base_SemEval18_Emodetection
This is a Twitter-roBERTa-base model fine-tuned on ~7,000 English tweets annotated for 11 emotion categories in [SemEval-2018 Task 1: Affect in Tweets, SubTask 5: Emotion Classification](https://competitions.codalab.org/competitions/17751).
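If you want a quick look at the annotations before running the full script, a minimal sketch like the following can print one labelled tweet; the dataset name, config and column names are the same ones used in the script below:

```python
from datasets import load_dataset

# Minimal sketch: inspect one annotated tweet from the training split.
example = load_dataset('sem_eval_2018_task_1', 'subtask5.english', split='train')[0]
print(example['Tweet'])
# each emotion is a boolean column, e.g. 'anger', 'joy', 'optimism', ...
print({k: v for k, v in example.items() if k not in ('ID', 'Tweet')})
```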
Run the classifier on the test set of the competition:
```python
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
import torch
import pandas as pd
# choose GPU when available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base",model_max_length=512)
# build custom model with classification layer on top and a dropout layer before
class RobertaClass(torch.nn.Module):

    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base",return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 11)

    def forward(self, input_ids, attention_mask):
        _, output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
model_name="twitter-roberta-base_semeval18_emodetection/pytorch_model.bin"
model=RobertaClass()
model.load_state_dict(torch.load(model_name,map_location=torch.device(device)))
model.eval()
# run on more than 1 GPU
model = torch.nn.DataParallel(model)
model.to(device)
twnames=['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']
# load from hugging face dataset hub
testset_raw = load_dataset('sem_eval_2018_task_1','subtask5.english',split='test')
# remove old columns
testset=testset_raw.remove_columns(twnames+["ID"])
# tokenize
testset_tokenized = testset.map(lambda e: tokenizer(e['Tweet'], truncation=True, padding='max_length'), batched=True)
testset_tokenized=testset_tokenized.remove_columns("Tweet")
testset_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
outfile="predicted_2018-E-c-En-test-gold.txt"
MAX_LEN = 512
VALID_BATCH_SIZE = 8
# set batch size according to available RAM
# VALID_BATCH_SIZE = 1000
# set num_workers for parallel processing
inference_params = {'batch_size': VALID_BATCH_SIZE,
                    'shuffle': False,
                    # 'num_workers': 1
                    }
inference_loader = DataLoader(testset_tokenized, **inference_params)
open(outfile,"w").close()
with torch.no_grad():
    # change lines for a progress bar (requires: from tqdm import tqdm)
    # for _, data in tqdm(enumerate(inference_loader, 0),total=len(inference_loader)):
    for _, data in enumerate(inference_loader, 0):
        outputs = model(input_ids=data['input_ids'],attention_mask=data['attention_mask'])
        fin_outputs = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        pd.DataFrame(fin_outputs).to_csv(outfile,index=False,header=False,sep="\t",mode='a')
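# optional sketch (assumption, not part of the original script): read the written scores
# back in and map them to binary emotion labels with a common 0.5 threshold
# predictions = (pd.read_csv(outfile, sep="\t", header=None) >= 0.5).astype(int)
# predictions.columns = twnames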
# # dataset from file (one text per line)
# from datasets import Dataset
# with open(linesoftextfile,"rb") as textfile:
# textdict={"text":[x.decode().rstrip("\n") for x in textfile.readlines()]}
# inference_dataset=Dataset.from_dict(textdict)
# del(textdict)
```
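The commented block at the end shows how to build a `Dataset` from a plain text file (one text per line). For a quick test on a handful of strings, the objects loaded above (`model`, `tokenizer`, `device`, `twnames`) can also be called directly. This is a minimal sketch: the example texts and the 0.5 decision threshold are illustrative assumptions, not part of the official evaluation setup.

```python
# Minimal sketch for classifying a few custom texts with the model loaded above.
my_texts = ["I can't wait for the weekend!",
            "This traffic is driving me crazy."]

encodings = tokenizer(my_texts, truncation=True, padding=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(input_ids=encodings['input_ids'].to(device),
                    attention_mask=encodings['attention_mask'].to(device))
    scores = torch.sigmoid(outputs).cpu().numpy()

# one probability per emotion; a 0.5 threshold is a common (assumed) default
for text, row in zip(my_texts, scores):
    predicted = [name for name, score in zip(twnames, row) if score >= 0.5]
    print(text, "->", predicted)
```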