thushalya committed on
Commit
861ab00
1 Parent(s): 7f0eec2

Add predicted_class as hate speech value

Browse files
Files changed (4) hide show
  1. .gitignore +2 -0
  2. app.py +368 -3
  3. model.pt +3 -0
  4. requirements.txt +82 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /env
2
+ /*env
app.py CHANGED
@@ -1,7 +1,372 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
  import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel
4
+ import re
5
+ from textblob import TextBlob
6
+ from nltk import pos_tag, word_tokenize
7
+ from nltk.corpus import stopwords
8
+ import emoji
9
+ import string
10
+ import nltk
11
+ from nltk import pos_tag
12
+ from nltk.tokenize import word_tokenize
13
+ from nltk.corpus import stopwords
14
+ import textstat
15
+ import pandas as pd
16
+ from transformers import pipeline
17
+ from torch.utils.data import Dataset, DataLoader
18
+ import torch.nn as nn
19
 
20
+
21
+
22
+
23
+
24
+
25
+
26
+ # Author-style features: helpers that compute per-tweet writing statistics
27
def average_word_length(tweet):
    """Return the mean length of the whitespace-separated words in ``tweet``.

    Args:
        tweet: Raw tweet text.

    Returns:
        The average number of characters per word. A tweet with no words
        (empty or whitespace-only) yields ``0.0`` rather than raising
        ``ZeroDivisionError``.
    """
    words = tweet.split()
    if not words:
        # Nothing to average; avoid dividing by zero on empty input.
        return 0.0
    return sum(len(word) for word in words) / len(words)
30
+
31
+
32
def lexical_diversity(tweet):
    """Return the ratio of distinct words to total words in ``tweet``.

    Words are the whitespace-separated tokens of the text and are compared
    case-sensitively, exactly as they appear.

    Args:
        tweet: Raw tweet text.

    Returns:
        ``len(unique words) / len(words)``. A tweet with no words (empty or
        whitespace-only) yields ``0.0`` rather than raising
        ``ZeroDivisionError``.
    """
    words = tweet.split()
    if not words:
        # No tokens at all: define the diversity as 0.0 instead of dividing by zero.
        return 0.0
    return len(set(words)) / len(words)
36
+
37
def count_capital_letters(tweet):
    """Count the characters of ``tweet`` for which ``str.isupper()`` is true."""
    uppercase_chars = [ch for ch in tweet if ch.isupper()]
    return len(uppercase_chars)
39
+
40
def count_words_surrounded_by_colons(tweet):
    """Count the non-overlapping ``:word:`` tokens found in ``tweet``.

    A token is one or more ``\\w`` characters enclosed by a colon on each
    side. Matches do not overlap, so in ``":a:b:"`` only ``":a:"`` counts.
    """
    # Iterate lazily over the matches instead of materialising the captured words.
    return sum(1 for _ in re.finditer(r':(\w+):', tweet))
49
+
50
def count_emojis(tweet):
    """Count emojis in ``tweet`` via their demojized ``:name:`` form.

    The text is first passed through ``emoji.demojize`` (which replaces emoji
    symbols with their names) and the colon-delimited words in the result are
    counted. Any ``:word:`` token already present in the raw text is therefore
    counted as well.
    """
    demojized_text = emoji.demojize(tweet)
    emoji_total = count_words_surrounded_by_colons(demojized_text)
    return emoji_total
54
+
55
def hashtag_frequency(tweet):
    """Return how many ``#word`` occurrences (``#`` followed by ``\\w+``) are in ``tweet``."""
    return sum(1 for _ in re.finditer(r'#\w+', tweet))
58
+
59
def mention_frequency(tweet):
    """Return how many ``@word`` occurrences (``@`` followed by ``\\w+``) are in ``tweet``."""
    mention_pattern = re.compile(r'@\w+')
    return len(mention_pattern.findall(tweet))
62
+
63
def count_special_characters(tweet):
    """Count the characters of ``tweet`` that belong to ``string.punctuation``."""
    punctuation_chars = string.punctuation
    return sum(1 for ch in tweet if ch in punctuation_chars)
66
+
67
+
68
def stop_word_frequency(tweet):
    """Count the whitespace-separated tokens of ``tweet`` that are English stop words.

    Tokens are lower-cased before being looked up in NLTK's English stop-word
    list; punctuation attached to a token is not stripped.
    """
    english_stop_words = set(stopwords.words('english'))
    return sum(1 for token in tweet.split() if token.lower() in english_stop_words)
72
+
73
# Fetch the NLTK resources at import time; presumably these back the
# word_tokenize / pos_tag / stopwords calls used by the helpers in this file.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(_nltk_resource)
76
+
77
def get_linguistic_features(tweet):
    """Count part-of-speech categories among the content words of ``tweet``.

    The tweet is tokenized with ``word_tokenize``; tokens that are not purely
    alphanumeric or that are English stop words are dropped, the rest are
    lower-cased and tagged with ``pos_tag``.

    Returns:
        A dict with the keys ``Noun_Count``, ``Verb_Count``,
        ``Participle_Count``, ``Interjection_Count``, ``Pronoun_Count``,
        ``Preposition_Count``, ``Adverb_Count`` and ``Conjunction_Count``,
        in that order.
    """
    english_stop_words = set(stopwords.words('english'))

    # Keep lower-cased alphanumeric tokens that are not stop words.
    content_tokens = []
    for token in word_tokenize(tweet):
        lowered = token.lower()
        if token.isalnum() and lowered not in english_stop_words:
            content_tokens.append(lowered)

    tagged_tokens = pos_tag(content_tokens)

    def count_tag_prefix(prefix):
        # Number of tagged tokens whose POS tag starts with `prefix`.
        return sum(1 for _, tag in tagged_tokens if tag.startswith(prefix))

    # A "participle" here is any verb-tagged token containing 'ing' or 'ed'
    # anywhere in the word (substring test, not a suffix test).
    participles = sum(
        1
        for token, tag in tagged_tokens
        if tag.startswith('V') and ('ing' in token or 'ed' in token)
    )
    # Interjections need an exact tag match rather than a prefix match.
    interjections = sum(1 for _, tag in tagged_tokens if tag == 'UH')

    return {
        'Noun_Count': count_tag_prefix('N'),
        'Verb_Count': count_tag_prefix('V'),
        'Participle_Count': participles,
        'Interjection_Count': interjections,
        'Pronoun_Count': count_tag_prefix('PRP'),
        'Preposition_Count': count_tag_prefix('IN'),
        'Adverb_Count': count_tag_prefix('RB'),
        'Conjunction_Count': count_tag_prefix('CC'),
    }
108
+
109
def readability_score(tweet):
    """Return the value ``textstat.flesch_reading_ease`` computes for ``tweet``."""
    flesch_score = textstat.flesch_reading_ease(tweet)
    return flesch_score
111
+
112
def get_url_frequency(tweet):
    """Return the number of ``http://`` / ``https://`` URLs found in ``tweet``."""
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return sum(1 for _ in re.finditer(url_pattern, tweet))
115
+
116
+
117
+ # Define a function to extract features from a single tweet
118
def extract_features(tweet):
    """Compute the author-style feature dict for a single tweet.

    The insertion order of the keys matters: ``load_model`` appends
    ``features.values()`` to the model's auxiliary vector in this order.

    Returns:
        A dict mapping feature names to numeric values: word-length,
        diversity, capitalisation, hashtag, mention, emoji, punctuation and
        stop-word statistics, then the counts from
        ``get_linguistic_features``, then readability and URL count.
    """
    features = {}
    features['Average_Word_Length'] = average_word_length(tweet)
    features['Lexical_Diversity'] = lexical_diversity(tweet)
    features['Capital_Letters_Count'] = count_capital_letters(tweet)
    features['Hashtag_Frequency'] = hashtag_frequency(tweet)
    features['Mention_Frequency'] = mention_frequency(tweet)
    features['count_emojis'] = count_emojis(tweet)
    features['special_chars_count'] = count_special_characters(tweet)
    features['Stop_Word_Frequency'] = stop_word_frequency(tweet)
    # Part-of-speech counts are merged in place, keeping their own key order.
    features.update(get_linguistic_features(tweet))
    features['Readability_Score'] = readability_score(tweet)
    features['URL_Frequency'] = get_url_frequency(tweet)
    return features
134
+
135
+ # # Extract features for all tweets
136
+ # features_list = [extract_features(tweet) for tweet in X['text']]
137
+
138
+ # # Create a Pandas DataFrame
139
+ # X_new = pd.DataFrame(features_list)
140
+
141
+
142
+
143
+ # Loading personality model
144
+
145
_PERSONALITY_MODEL_ID = "Nasserelsaman/microsoft-finetuned-personality"
# Lazily filled with {"tokenizer": ..., "model": ...} on first use so the
# weights are fetched/instantiated once per process instead of once per call.
_PERSONALITY_CACHE = {}


def _load_personality_model():
    """Return the ``(tokenizer, model)`` pair for the personality classifier.

    The Hugging Face access token is read from the ``PERSONALITY_TOKEN``
    environment variable; when it is unset, ``token=None`` is passed to
    ``from_pretrained``.
    """
    if not _PERSONALITY_CACHE:
        import os

        # The original code referenced an undefined PERSONALITY_TOKEN global,
        # which raised NameError on every call.
        token = os.environ.get("PERSONALITY_TOKEN")
        _PERSONALITY_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(
            _PERSONALITY_MODEL_ID, token=token
        )
        _PERSONALITY_CACHE["model"] = AutoModelForSequenceClassification.from_pretrained(
            _PERSONALITY_MODEL_ID, token=token
        )
    return _PERSONALITY_CACHE["tokenizer"], _PERSONALITY_CACHE["model"]


def personality_detection(text, threshold=0.05, endpoint=1.0):
    """Score ``text`` with the personality classifier.

    Args:
        text: Input text (a tweet).
        threshold: Unused; kept so existing callers remain valid.
        endpoint: Unused; kept so existing callers remain valid.

    Returns:
        A list of five 0-d NumPy arrays: the sigmoid of the first five logits
        for the (single) input. NOTE(review): a commented-out list in the
        original names the labels Agreeableness, Conscientiousness,
        Extraversion, Neuroticism, Openness -- confirm the order against the
        model card before relying on it.
    """
    tokenizer, model = _load_personality_model()
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

    # One forward pass is enough (the original ran the model twice), and
    # inference needs no autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Squash each logit independently into (0, 1).
    probabilities = torch.sigmoid(logits)
    return [probabilities[0][i].detach().numpy() for i in range(5)]
177
+
178
+
179
+ # tokenizer = AutoTokenizer.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")
180
+ # model = AutoModelForSequenceClassification.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")
181
+
182
+ #Loading emotion model
183
+
184
+ # tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
185
+ # model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
186
+
187
+ ##use this for gpu
188
+ # pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True,device=device )
189
+
190
+ ##use this for cpu
191
def calc_emotion_score(tweet):
    """Return the first eleven emotion scores the classifier gives ``tweet``.

    A ``text-classification`` pipeline for
    ``cardiffnlp/twitter-roberta-base-emotion-multilabel-latest`` is built on
    each call. Every label/score entry of the first result is printed, and
    the ``score`` values of entries 0..10 are returned as a list, in the
    order the pipeline reports them.
    """
    classifier = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True)
    label_scores = classifier(tweet)[0]
    for entry in label_scores:
        print(entry)

    return [label_scores[idx]['score'] for idx in range(11)]
198
+
199
+
200
+
201
+
202
+
203
+
204
+ #DCL model launching
205
+
206
_HATE_SPEECH_BACKBONE = "vinai/bertweet-base"
_HATE_SPEECH_MAX_LENGTH = 45
_HATE_SPEECH_CHECKPOINT = "./model.pt"
_HATE_SPEECH_DROPOUT = 0.5
# Lazily filled with the tokenizer, model and device on the first request so
# the backbone and the checkpoint are loaded once per process, not per tweet.
_HATE_SPEECH_CACHE = {}


class EmotionAuthorGuidedDCLModel(nn.Module):
    """Linear classification head over encoder output plus auxiliary features.

    The wrapped encoder is frozen. Its second output is concatenated with the
    ``emotion_author_vector`` and fed through dropout and a single linear
    layer producing one logit per example.
    """

    def __init__(self, dcl_model: nn.Module, dropout: float = 0.5):
        super().__init__()
        self.dcl_model = dcl_model
        # 802 = 768 encoder features + 34 auxiliary features
        # (11 emotion scores + 18 author-style features + 5 personality scores).
        self.dim = 802
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.dim, 1)
        # Freeze all encoder layers.
        for param in self.dcl_model.parameters():
            param.requires_grad = False

    def forward(self, batch_tokenized):
        """Return one logit per example of ``batch_tokenized``.

        ``batch_tokenized`` must provide ``input_ids``, ``attention_mask`` and
        ``emotion_author_vector``.
        """
        input_ids = batch_tokenized['input_ids']
        attention_mask = batch_tokenized['attention_mask']
        emotion_vector = batch_tokenized['emotion_author_vector']
        bert_output = self.dcl_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # Element 1 of the encoder output; presumably the pooled [CLS]
        # representation for this backbone -- TODO confirm.
        bert_cls_hidden_state = bert_output[1]
        combined_vector = torch.cat((bert_cls_hidden_state, emotion_vector), 1)
        linear_output = self.linear(self.dropout(combined_vector))
        return linear_output.squeeze(1)


def _get_hate_speech_components():
    """Build (once) and return ``(tokenizer, model, device)`` for ``load_model``."""
    if not _HATE_SPEECH_CACHE:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(_HATE_SPEECH_BACKBONE)
        # The original wrapped this encoder in a DCLArchitecture module and
        # then used only its `.bert` attribute; load the encoder directly.
        backbone = AutoModel.from_pretrained(_HATE_SPEECH_BACKBONE)
        model = EmotionAuthorGuidedDCLModel(dcl_model=backbone, dropout=_HATE_SPEECH_DROPOUT)
        model.to(device)
        # model.pt holds the state dict for the whole EmotionAuthorGuidedDCLModel.
        model.load_state_dict(torch.load(_HATE_SPEECH_CHECKPOINT, map_location='cpu'))
        model.eval()
        _HATE_SPEECH_CACHE.update(tokenizer=tokenizer, model=model, device=device)
    return (
        _HATE_SPEECH_CACHE["tokenizer"],
        _HATE_SPEECH_CACHE["model"],
        _HATE_SPEECH_CACHE["device"],
    )


def load_model(tweet):
    """Classify a single tweet with the emotion/author-guided model.

    The tweet is tokenized (padded/truncated to 45 tokens) and paired with an
    auxiliary vector made of, in order: the emotion scores from
    ``calc_emotion_score``, the values of ``extract_features`` and the scores
    from ``personality_detection``.

    Args:
        tweet: Raw tweet text.

    Returns:
        ``0.0`` or ``1.0``: the rounded sigmoid of the model's logit.
        NOTE(review): which value denotes hate speech is not visible here --
        confirm against the training labels.
    """
    tokenizer, model, device = _get_hate_speech_components()

    inputs = tokenizer(
        tweet,
        truncation=True,
        padding='max_length',
        max_length=_HATE_SPEECH_MAX_LENGTH,
        add_special_tokens=True,
        return_tensors="pt",
    )

    # Order matters: the linear head was trained on emotion + author + personality.
    emotion_author_vector = calc_emotion_score(tweet)
    emotion_author_vector.extend(extract_features(tweet).values())
    emotion_author_vector.extend(personality_detection(tweet))
    # Wrap in an outer list to get a batch of one: shape (1, n_features).
    inputs['emotion_author_vector'] = torch.tensor([emotion_author_vector])

    batch = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        logit = model(batch)
    return torch.round(torch.sigmoid(logit)).item()
353
+ # print("Hate speech result",predicted_class)
354
+
355
+
356
+
357
+
358
+ #Gradio interface
359
def greet(tweet):
    """Gradio callback: classify ``tweet`` and return the prediction as text.

    Prints ``start`` before and ``end`` after delegating to ``load_model``.
    """
    print("start")
    prediction = load_model(tweet)
    print("end")
    prediction_text = str(prediction)
    return prediction_text
370
 
371
  demo = gr.Interface(fn=greet, inputs="text", outputs="text")
372
+ demo.launch()
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a0522ff50dd3433230896898665a1b3a8d5fbaf72f5c2f6286a51e267f56b45
3
+ size 539673670
requirements.txt CHANGED
@@ -1,2 +1,82 @@
1
- gradio
2
- torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.3.0
3
+ annotated-types==0.6.0
4
+ anyio==4.3.0
5
+ attrs==23.2.0
6
+ certifi==2024.2.2
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ colorama==0.4.6
10
+ contourpy==1.2.1
11
+ cycler==0.12.1
12
+ emoji==2.11.1
13
+ fastapi==0.110.3
14
+ ffmpy==0.3.2
15
+ filelock==3.14.0
16
+ fonttools==4.51.0
17
+ fsspec==2024.3.1
18
+ gradio==4.28.3
19
+ gradio_client==0.16.0
20
+ h11==0.14.0
21
+ httpcore==1.0.5
22
+ httpx==0.27.0
23
+ huggingface-hub==0.22.2
24
+ idna==3.7
25
+ importlib_resources==6.4.0
26
+ intel-openmp==2021.4.0
27
+ Jinja2==3.1.3
28
+ joblib==1.4.0
29
+ jsonschema==4.22.0
30
+ jsonschema-specifications==2023.12.1
31
+ kiwisolver==1.4.5
32
+ markdown-it-py==3.0.0
33
+ MarkupSafe==2.1.5
34
+ matplotlib==3.8.4
35
+ mdurl==0.1.2
36
+ mkl==2021.4.0
37
+ mpmath==1.3.0
38
+ networkx==3.3
39
+ nltk==3.8.1
40
+ numpy==1.26.4
41
+ orjson==3.10.2
42
+ packaging==24.0
43
+ pandas==2.2.2
44
+ pillow==10.3.0
45
+ pydantic==2.7.1
46
+ pydantic_core==2.18.2
47
+ pydub==0.25.1
48
+ Pygments==2.17.2
49
+ pyparsing==3.1.2
50
+ pyphen==0.15.0
51
+ python-dateutil==2.9.0.post0
52
+ python-multipart==0.0.9
53
+ pytz==2024.1
54
+ PyYAML==6.0.1
55
+ referencing==0.35.0
56
+ regex==2024.4.28
57
+ requests==2.31.0
58
+ rich==13.7.1
59
+ rpds-py==0.18.0
60
+ ruff==0.4.2
61
+ safetensors==0.4.3
62
+ semantic-version==2.10.0
63
+ shellingham==1.5.4
64
+ six==1.16.0
65
+ sniffio==1.3.1
66
+ starlette==0.37.2
67
+ sympy==1.12
68
+ tbb==2021.12.0
69
+ textblob==0.18.0.post0
70
+ textstat==0.7.3
71
+ tokenizers==0.19.1
72
+ tomlkit==0.12.0
73
+ toolz==0.12.1
74
+ torch==2.3.0
75
+ tqdm==4.66.2
76
+ transformers==4.40.1
77
+ typer==0.12.3
78
+ typing_extensions==4.11.0
79
+ tzdata==2024.1
80
+ urllib3==2.2.1
81
+ uvicorn==0.29.0
82
+ websockets==11.0.3