Spaces:

Thushalya
/

AiLERT

Running

AiLERT / app.py

thushalya

Add predicted_class as hate speech value

861ab00 7 months ago

14.4 kB

	import gradio as gr
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel
	import re
	from textblob import TextBlob
	from nltk import pos_tag, word_tokenize
	from nltk.corpus import stopwords
	import emoji
	import string
	import nltk
	from nltk import pos_tag
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	import textstat
	import pandas as pd
	from transformers import pipeline
	from torch.utils.data import Dataset, DataLoader
	import torch.nn as nn







	# Author-style feature helpers (each computes one feature from the raw tweet text)
	def average_word_length(tweet):
	    """Return the mean length of the whitespace-separated words in *tweet*.

	    Returns 0.0 when the tweet contains no words (empty or whitespace-only
	    input) instead of raising ZeroDivisionError.
	    """
	    words = tweet.split()
	    if not words:
	        # Nothing to average over; avoid dividing by zero.
	        return 0.0
	    return sum(len(word) for word in words) / len(words)


	def lexical_diversity(tweet):
	    """Return the ratio of distinct words to total words in *tweet*.

	    Words are the whitespace-separated tokens, compared case-sensitively.
	    Returns 0.0 when the tweet contains no words (empty or whitespace-only
	    input) instead of raising ZeroDivisionError.
	    """
	    words = tweet.split()
	    if not words:
	        # No tokens means no diversity to measure; avoid dividing by zero.
	        return 0.0
	    return len(set(words)) / len(words)

	def count_capital_letters(tweet):
	    """Return how many characters of *tweet* are uppercase."""
	    uppercase_chars = [ch for ch in tweet if ch.isupper()]
	    return len(uppercase_chars)

	def count_words_surrounded_by_colons(tweet):
	    """Return the number of non-overlapping ``:word:`` tokens in *tweet*."""
	    # One or more word characters enclosed by a pair of colons.
	    colon_word = re.compile(r':(\w+):')
	    return sum(1 for _ in colon_word.finditer(tweet))

	def count_emojis(tweet):
	    """Return the number of ``:name:`` tokens in *tweet* after demojizing it.

	    The tweet is first passed through ``emoji.demojize``; the result is then
	    scanned for colon-delimited words, so any literal ``:word:`` text already
	    present in the tweet is counted as well.
	    """
	    return count_words_surrounded_by_colons(emoji.demojize(tweet))

	def hashtag_frequency(tweet):
	    """Return the number of ``#word`` hashtags found in *tweet*."""
	    return sum(1 for _ in re.finditer(r'#\w+', tweet))

	def mention_frequency(tweet):
	    """Return the number of ``@word`` mentions found in *tweet*."""
	    return sum(1 for _ in re.finditer(r'@\w+', tweet))

	def count_special_characters(tweet):
	    """Return how many characters of *tweet* are in ``string.punctuation``."""
	    return sum(1 for ch in tweet if ch in string.punctuation)


	def stop_word_frequency(tweet):
	    """Return how many whitespace-separated tokens of *tweet* are English stop words.

	    Tokens are lower-cased before being looked up in NLTK's English
	    stop-word list.
	    """
	    english_stops = set(stopwords.words('english'))
	    return sum(1 for token in tweet.split() if token.lower() in english_stops)

	# Fetch the NLTK resources requested by this app at import time.
	for _resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
	    nltk.download(_resource)

	def get_linguistic_features(tweet):
	    """Count part-of-speech categories among the content words of *tweet*.

	    The tweet is tokenized, reduced to lower-cased alphanumeric tokens that
	    are not English stop words, and POS-tagged. The returned dict maps eight
	    feature names to integer counts; its key order is fixed.
	    """
	    tokens = word_tokenize(tweet)
	    english_stops = set(stopwords.words('english'))

	    # Keep lower-cased alphanumeric tokens that are not stop words.
	    content_words = []
	    for token in tokens:
	        lowered = token.lower()
	        if token.isalnum() and lowered not in english_stops:
	            content_words.append(lowered)

	    counts = dict.fromkeys(
	        (
	            'Noun_Count',
	            'Verb_Count',
	            'Participle_Count',
	            'Interjection_Count',
	            'Pronoun_Count',
	            'Preposition_Count',
	            'Adverb_Count',
	            'Conjunction_Count',
	        ),
	        0,
	    )

	    # Single pass over the tagged words; each test is independent.
	    for word, tag in pos_tag(content_words):
	        if tag.startswith('N'):
	            counts['Noun_Count'] += 1
	        if tag.startswith('V'):
	            counts['Verb_Count'] += 1
	            # A verb counts as a participle when it contains 'ing' or 'ed'.
	            if 'ing' in word or 'ed' in word:
	                counts['Participle_Count'] += 1
	        if tag == 'UH':
	            counts['Interjection_Count'] += 1
	        if tag.startswith('PRP'):
	            counts['Pronoun_Count'] += 1
	        if tag.startswith('IN'):
	            counts['Preposition_Count'] += 1
	        if tag.startswith('RB'):
	            counts['Adverb_Count'] += 1
	        if tag.startswith('CC'):
	            counts['Conjunction_Count'] += 1

	    return counts

	def readability_score(tweet):
	    """Return the value of ``textstat.flesch_reading_ease`` for *tweet*."""
	    score = textstat.flesch_reading_ease(tweet)
	    return score

	def get_url_frequency(tweet):
	    """Return the number of ``http://`` or ``https://`` URLs found in *tweet*."""
	    # The alternatives inside the group must be separated by a bare `|`.
	    # An escaped `\|` matches a literal pipe character, which made the
	    # previous pattern unable to match any ordinary URL.
	    url_pattern = (
	        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
	        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
	    )
	    return len(re.findall(url_pattern, tweet))


	# Define a function to extract features from a single tweet
	def extract_features(tweet):
	    """Collect every author-style feature of *tweet* into one dict.

	    Keys are inserted in a fixed order; callers that flatten the dict's
	    values into a vector rely on that order.
	    """
	    features = {}
	    features['Average_Word_Length'] = average_word_length(tweet)
	    features['Lexical_Diversity'] = lexical_diversity(tweet)
	    features['Capital_Letters_Count'] = count_capital_letters(tweet)
	    features['Hashtag_Frequency'] = hashtag_frequency(tweet)
	    features['Mention_Frequency'] = mention_frequency(tweet)
	    features['count_emojis'] = count_emojis(tweet)
	    features['special_chars_count'] = count_special_characters(tweet)
	    features['Stop_Word_Frequency'] = stop_word_frequency(tweet)
	    # Part-of-speech counts are merged in at this position.
	    features.update(get_linguistic_features(tweet))
	    features['Readability_Score'] = readability_score(tweet)
	    features['URL_Frequency'] = get_url_frequency(tweet)
	    return features

	# # Extract features for all tweets
	# features_list = [extract_features(tweet) for tweet in X['text']]

	# # Create a Pandas DataFrame
	# X_new = pd.DataFrame(features_list)



	# Loading personality model

	def personality_detection(text, threshold=0.05, endpoint= 1.0):
	    """Score *text* with the fine-tuned personality classifier.

	    Args:
	        text: The text to score.
	        threshold: Unused; kept so existing callers remain valid.
	        endpoint: Unused; kept so existing callers remain valid.

	    Returns:
	        A list of five zero-dimensional NumPy arrays holding the sigmoid of
	        the model's first five logits, in the model's output order.
	    """
	    import os

	    model_id = "Nasserelsaman/microsoft-finetuned-personality"

	    # PERSONALITY_TOKEN is not assigned anywhere in the visible file, so a
	    # bare reference raises NameError. Use a module-level value when one
	    # exists, otherwise fall back to the environment variable of the same
	    # name. The result may be None when neither is set.
	    token = globals().get("PERSONALITY_TOKEN") or os.environ.get("PERSONALITY_TOKEN")

	    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
	    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=token)

	    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

	    # One forward pass is enough; gradients are not needed for inference.
	    with torch.no_grad():
	        logits = model(**inputs).logits

	    # Squash each logit independently into the range (0, 1).
	    probabilities = torch.sigmoid(logits)

	    return [probabilities[0][i].detach().numpy() for i in range(5)]


	# tokenizer = AutoTokenizer.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")
	# model = AutoModelForSequenceClassification.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")

	#Loading emotion model

	# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
	# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")

	##use this for gpu
	# pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True,device=device )

	##use this for cpu
	def calc_emotion_score(tweet):
	    """Return the first eleven emotion scores the classifier gives *tweet*.

	    A text-classification pipeline is built on each call, every returned
	    label/score entry is printed, and the scores of entries 0 through 10 are
	    returned as a list in pipeline order.
	    """
	    classifier = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True )
	    label_scores = classifier(tweet)[0]
	    for entry in label_scores:
	        print(entry)

	    return [label_scores[position]['score'] for position in range(11)]






	#DCL model launching

	def load_model(tweet):
	    """Classify a single tweet and return the rounded prediction.

	    Steps, all performed on every call:
	      1. Tokenize the tweet with the ``vinai/bertweet-base`` tokenizer,
	         padded/truncated to 45 tokens.
	      2. Build one flat feature list: the emotion scores from
	         ``calc_emotion_score``, then the values of ``extract_features``,
	         then the outputs of ``personality_detection``.
	      3. Build the network, load its weights from ``./model.pt`` and run it.

	    Returns:
	        The value of ``torch.round(torch.sigmoid(pred)).item()``, i.e. 0.0 or
	        1.0. Presumably 1.0 means "hate speech" -- confirm against the
	        training labels.

	    NOTE(review): the tokenizer, encoder and checkpoint are re-created on
	    every call because all of that code lives inside this function.
	    """
	    # model = torch.load("./authormodel.pt",map_location ='cpu')
	    # print(model)

	    model_name = "vinai/bertweet-base"
	    PADDING_MAX_LENGTH = 45
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    inputs = tokenizer(tweet, truncation=True, padding='max_length',max_length=PADDING_MAX_LENGTH,add_special_tokens=True, return_tensors="pt")
	    print(inputs)
	    # Start the feature list with the emotion scores.
	    emotion_list = calc_emotion_score(tweet)
	    print(emotion_list)

	    # Append the author-style feature values in dict order.
	    features_list = extract_features(tweet)
	    for i in features_list.values():
	        emotion_list.append(i)
	    print("emotion + author",emotion_list)
	    # print()
	    # print(features_list)
	    # Finally append the personality scores.
	    personality_list = personality_detection(tweet)
	    print("personality",personality_list)
	    # person_list = [personality_list["Extraversion"],personality_list['Neuroticism'],personality_list['Agreeableness'],personality_list['Conscientiousness'],personality_list['Openness']]
	    emotion_list.extend(personality_list)
	    print("final list",emotion_list)
	    # print(str(features_list["Average_Word_Length"]))
	    # Wrap the flat list in an outer list so the tensor has a leading
	    # dimension of size 1, then store it alongside the tokenizer outputs.
	    inputs['emotion_author_vector'] = torch.tensor([emotion_list])

	    print("final inputs ",inputs)


	    # train_dataloader=DataLoader(inputs, batch_size=1 , shuffle=False)
	    # print(train_dataloader)
	    # Use the first CUDA device when available, otherwise the CPU.
	    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	    # def tokenize_function(examples):
	    # return tokenizer.batch_encode_plus(examples["text"], padding='max_length',max_length=PADDING_MAX_LENGTH,add_special_tokens=True,truncation=True)
	    class EmotionAuthorGuidedDCLModel(nn.Module):
	        """Frozen encoder plus a linear head over [encoder output ; feature vector]."""

	        def __init__(self,dcl_model:nn.Module,dropout:float=0.5):
	            super(EmotionAuthorGuidedDCLModel, self).__init__()
	            self.dcl_model = dcl_model
	            # Input width of the linear head. Presumably 768 (the encoder
	            # width used by DCLArchitecture below) plus the 34 appended
	            # feature values -- TODO confirm against the checkpoint.
	            self.dim = 802
	            self.dropout = nn.Dropout(dropout)
	            self.linear = nn.Linear(self.dim, 1)
	            # Freeze all layers
	            for param in self.dcl_model.parameters():
	                param.requires_grad = False

	        def forward(self,batch_tokenized):
	            input_ids = batch_tokenized['input_ids']
	            attention_mask = batch_tokenized['attention_mask']
	            emotion_vector = batch_tokenized['emotion_author_vector']
	            bert_output = self.dcl_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
	            # Second element of the encoder output; presumably the pooled
	            # sentence representation -- confirm for this encoder.
	            bert_cls_hidden_state = bert_output[1]
	            # Concatenate encoder output and feature vector along dim 1.
	            combined_vector =torch.cat((bert_cls_hidden_state,emotion_vector), 1)
	            d_combined_vector=self.dropout(combined_vector)
	            linear_output = self.linear(d_combined_vector)
	            # Drop the trailing size-1 dimension produced by the linear head.
	            pred_linear = linear_output.squeeze(1)
	            return pred_linear

	    # State dict read from ./model.pt and mapped onto the CPU.
	    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
	    checkpoint = {
	    "model_state_dict":torch.load("./model.pt",map_location ='cpu') ,
	    }

	    # checkpoint=load_checkpoint(run=run_dcl_study,check_point_name="model_checkpoints/")

	    class DCLArchitecture(nn.Module):
	        """Pretrained encoder with dropout and a single-output dense layer.

	        Within load_model only the ``bert`` attribute of this class is used.
	        """

	        def __init__(self,dropout:float,bert_model_name:str='vinai/bertweet-base'):
	            super(DCLArchitecture, self).__init__()
	            self.bert = AutoModel.from_pretrained(bert_model_name)
	            self.dim = 768
	            self.dense = nn.Linear(self.dim, 1)
	            self.dropout = nn.Dropout(dropout)

	        def forward(self,batch_tokenized, if_train=False):
	            input_ids = batch_tokenized['input_ids']
	            attention_mask = batch_tokenized['attention_mask']
	            bert_output = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
	            bert_cls_hidden_state = bert_output[1]
	            torch.cuda.empty_cache()

	            if if_train:
	                # Pair each representation with a dropout-perturbed copy and
	                # reshape the result back to rows of width self.dim.
	                bert_cls_hidden_state_aug = self.dropout(bert_cls_hidden_state)
	                bert_cls_hidden_state = torch.cat((bert_cls_hidden_state, bert_cls_hidden_state_aug), dim=1).reshape(-1, self.dim)
	            else:
	                bert_cls_hidden_state = self.dropout(bert_cls_hidden_state)

	            linear_output = self.dense(bert_cls_hidden_state)
	            linear_output = linear_output.squeeze(1)

	            return bert_cls_hidden_state, linear_output


	    # dcl_model = DCLArchitecture(bert_model_name=model_name,dropout=best_prams["DROPOUT"])
	    dcl_model = DCLArchitecture(bert_model_name=model_name,dropout=0.5)
	    dcl_model.to(device)

	    DROPOUT = 0.5
	    # Only the encoder of the DCL model is handed to the final classifier.
	    fined_tuned_bert_model=dcl_model.bert
	    model = EmotionAuthorGuidedDCLModel(dcl_model=fined_tuned_bert_model,dropout=DROPOUT)
	    model.to(device)
	    # Overwrite the freshly built weights with those stored in the checkpoint.
	    model.load_state_dict(checkpoint["model_state_dict"])




	    # def test_loop(model, test_dataloader, device):
	    # # collection_metric = MetricCollection(
	    # # BinaryAccuracy(),
	    # # MulticlassPrecision(num_classes=2,average=average),
	    # # MulticlassRecall(num_classes=2,average=average),
	    # # MulticlassF1Score(num_classes=2,average=average),
	    # # BinaryConfusionMatrix()
	    # # )
	    # # collection_metric.to(device)
	    # model.eval()
	    # print(test_dataloader)
	    # # total_test_loss = 0.0
	    # for batch in test_dataloader:
	    # print(batch)
	    # batch = {k: v.to(device) for k, v in batch.items()}
	    # # labels = batch["labels"]
	    # with torch.no_grad():
	    # pred = model(batch)
	    # # loss = criteon(pred, labels.float())
	    # pred = torch.round(torch.sigmoid(pred))

	    # return pred
	    # result_metrics=test_loop(model=model, test_dataloader=train_dataloader,device=device)
	    # print("Hate speech result",result_metrics)

	    def predict_single_text(model, inputs,device):
	        """Run *model* on one prepared input and return the rounded sigmoid output."""
	        # Preprocess the text
	        # inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
	        # Move every input tensor to the target device.
	        inputs = {k: v.to(device) for k, v in inputs.items()}

	        # Pass the preprocessed text through the model
	        with torch.no_grad():
	            model.eval()
	            pred = model(inputs)
	            # Assuming your model returns a single value for prediction
	            pred = torch.round(torch.sigmoid(pred)).item()

	        return pred

	    predicted_class = predict_single_text(model, inputs, device)
	    return predicted_class
	# print("Hate speech result",predicted_class)




	#Gradio interface
	def greet(tweet):
	    """Gradio callback: classify *tweet* and return the prediction as a string.

	    Prints "start" before and "end" after the call to ``load_model``.
	    """
	    print("start")
	    outcome = load_model(tweet)
	    print("end")
	    return str(outcome)

	# Text-in / text-out Gradio interface around `greet`, launched at import time.
	demo = gr.Interface(
	    fn=greet,
	    inputs="text",
	    outputs="text",
	)
	demo.launch()