ypesk committed on
Commit
b18c63e
·
verified ·
1 Parent(s): 25c0499

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +86 -41
tasks/text.py CHANGED
@@ -21,6 +21,63 @@ router = APIRouter()
21
  DESCRIPTION = "First Baseline"
22
  ROUTE = "/text"
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  @router.post(ROUTE, tags=["Text Task"],
25
  description=DESCRIPTION)
26
  async def evaluate_text(request: TextEvaluationRequest):
@@ -64,53 +121,41 @@ async def evaluate_text(request: TextEvaluationRequest):
64
  # YOUR MODEL INFERENCE CODE HERE
65
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
66
  #--------------------------------------------------------------------------------------------
67
- class CovidTwitterBertClassifier(
68
- nn.Module,
69
- PyTorchModelHubMixin,
70
- # optionally, you can add metadata which gets pushed to the model card
71
- ):
72
- def __init__(self, num_classes):
73
- super().__init__()
74
- self.n_classes = num_classes
75
- self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
76
- self.bert.cls.seq_relationship = nn.Linear(1024, num_classes)
77
-
78
- self.sigmoid = nn.Sigmoid()
79
-
80
- def forward(self, input_ids, token_type_ids, input_mask):
81
- outputs = self.bert(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = input_mask)
82
-
83
- logits = outputs[1]
84
-
85
- return logits
86
- model = CovidTwitterBertClassifier.from_pretrained("ypesk/ct-baseline")
87
- model.eval()
88
-
89
-
90
- tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')
91
-
92
- test_texts = [t['quote'] for t in test_dataset]
93
 
94
- MAX_LEN = 256 #1024 # < m some tweets will be truncated
95
-
96
- tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
97
- test_input_ids, test_token_type_ids, test_attention_mask = tokenized_test['input_ids'], tokenized_test['token_type_ids'], tokenized_test['attention_mask']
98
- test_token_type_ids = torch.tensor(test_token_type_ids)
 
 
 
 
 
99
 
100
- test_input_ids = torch.tensor(test_input_ids)
101
- test_attention_mask = torch.tensor(test_attention_mask)
102
-
103
- batch_size = 12 #
104
- test_data = TensorDataset(test_input_ids, test_attention_mask, test_token_type_ids)
 
 
 
105
 
106
- test_sampler = SequentialSampler(test_data)
107
- test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
 
 
 
108
 
 
109
  predictions = []
110
- c=0
111
  for batch in tqdm(test_dataloader):
112
- print(c)
113
- c+=1
114
 
115
  b_input_ids, b_input_mask, b_token_type_ids = batch
116
  with torch.no_grad():
 
21
  DESCRIPTION = "First Baseline"
22
  ROUTE = "/text"
23
 
24
+
25
+ MODEL = "mlp" #mlp, ct, modern
26
+
27
+ class ConspiracyClassification(
28
+ nn.Module,
29
+ PyTorchModelHubMixin,
30
+ # optionally, you can add metadata which gets pushed to the model card
31
+ ):
32
+ def __init__(self, num_classes):
33
+ super().__init__()
34
+ self.h1 = nn.Linear(384, 100)
35
+ self.h2 = nn.Linear(100, 100)
36
+ self.h3 = nn.Linear(100, 100)
37
+ self.h4 = nn.Linear(100, 50)
38
+ self.h5 = nn.Linear(50, num_classes)
39
+ self.dropout = nn.Dropout(0.2)
40
+ self.activation = nn.ReLU()
41
+
42
+
43
+ def forward(self, input_texts):
44
+ outputs = self.h1(input_texts)
45
+ outputs = self.activation(outputs)
46
+ outputs = self.dropout(outputs)
47
+ outputs = self.h2(outputs)
48
+ outputs = self.activation(outputs)
49
+ outputs = self.dropout(outputs)
50
+ outputs = self.h3(outputs)
51
+ outputs = self.activation(outputs)
52
+ outputs = self.dropout(outputs)
53
+ outputs = self.h4(outputs)
54
+ outputs = self.activation(outputs)
55
+ outputs = self.dropout(outputs)
56
+ outputs = self.h5(outputs)
57
+
58
+ return outputs
59
+
60
+ class CovidTwitterBertClassifier(
61
+ nn.Module,
62
+ PyTorchModelHubMixin,
63
+ # optionally, you can add metadata which gets pushed to the model card
64
+ ):
65
+ def __init__(self, num_classes):
66
+ super().__init__()
67
+ self.n_classes = num_classes
68
+ self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
69
+ self.bert.cls.seq_relationship = nn.Linear(1024, num_classes)
70
+
71
+ self.sigmoid = nn.Sigmoid()
72
+
73
+ def forward(self, input_ids, token_type_ids, input_mask):
74
+ outputs = self.bert(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = input_mask)
75
+
76
+ logits = outputs[1]
77
+
78
+ return logits
79
+
80
+
81
  @router.post(ROUTE, tags=["Text Task"],
82
  description=DESCRIPTION)
83
  async def evaluate_text(request: TextEvaluationRequest):
 
121
  # YOUR MODEL INFERENCE CODE HERE
122
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
123
  #--------------------------------------------------------------------------------------------
124
+ if MODEL =="mlp":
125
+ model = ConspiracyClassification.from_pretrained("ypesk/frugal-ai-mlp-baseline")
126
+
127
+ emb_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
128
+ batch_size = 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ test_texts = torch.Tensor(emb_model.encode([t['quote'] for t in test_dataset]))
131
+ test_data = TensorDataset(test_texts)
132
+ test_sampler = SequentialSampler(test_data)
133
+ test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
134
+
135
+ elif MODEL == "ct":
136
+ model = CovidTwitterBertClassifier.from_pretrained("ypesk/ct-baseline")
137
+ tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')
138
+
139
+ test_texts = [t['quote'] for t in test_dataset]
140
 
141
+ MAX_LEN = 256 #1024 # inputs longer than MAX_LEN tokens will be truncated
142
+
143
+ tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
144
+ test_input_ids, test_token_type_ids, test_attention_mask = tokenized_test['input_ids'], tokenized_test['token_type_ids'], tokenized_test['attention_mask']
145
+ test_token_type_ids = torch.tensor(test_token_type_ids)
146
+
147
+ test_input_ids = torch.tensor(test_input_ids)
148
+ test_attention_mask = torch.tensor(test_attention_mask)
149
 
150
+ batch_size = 12 #
151
+ test_data = TensorDataset(test_input_ids, test_attention_mask, test_token_type_ids)
152
+
153
+ test_sampler = SequentialSampler(test_data)
154
+ test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
155
 
156
+ model.eval()
157
  predictions = []
 
158
  for batch in tqdm(test_dataloader):
 
 
159
 
160
  b_input_ids, b_input_mask, b_token_type_ids = batch
161
  with torch.no_grad():