Spaces:

Thushalya
/

AiLERT

Running

App Files Files Community

thushalya committed on May 7

Commit

861ab00

•

1 Parent(s): 7f0eec2

Add predicted_class as hate speech value

Browse files

Files changed (4) hide show

.gitignore +2 -0
app.py +368 -3
model.pt +3 -0
requirements.txt +82 -2

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ /env
2	+ /*env

app.py CHANGED Viewed

@@ -1,7 +1,372 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
 demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel
+import re
+from textblob import TextBlob
+from nltk import pos_tag, word_tokenize
+from nltk.corpus import stopwords
+import emoji
+import string
+import nltk
+from nltk import pos_tag
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+import textstat
+import pandas as pd
+from transformers import pipeline
+from torch.utils.data import Dataset, DataLoader
+import torch.nn as nn
+#Loading author details
+def average_word_length(tweet):
+    """Return the mean number of characters per whitespace-separated word.
+    An empty or whitespace-only tweet has no words; 0.0 is returned for it
+    instead of raising ZeroDivisionError.
+    """
+    words = tweet.split()
+    if not words:
+        return 0.0
+    return sum(len(word) for word in words) / len(words)
+def lexical_diversity(tweet):
+    """Return the ratio of distinct words to total words (case-sensitive).
+    Words are whitespace-separated tokens. An empty or whitespace-only tweet
+    yields 0.0 instead of raising ZeroDivisionError.
+    """
+    words = tweet.split()
+    if not words:
+        return 0.0
+    return len(set(words)) / len(words)
+def count_capital_letters(tweet):
+    """Return how many characters of the tweet are uppercase."""
+    uppercase_chars = [ch for ch in tweet if ch.isupper()]
+    return len(uppercase_chars)
+def count_words_surrounded_by_colons(tweet):
+    """Return the number of non-overlapping ':word:' occurrences in the tweet."""
+    # Same pattern as before; iterate the matches instead of materialising them.
+    return sum(1 for _ in re.finditer(r':(\w+):', tweet))
+def count_emojis(tweet):
+    """Count emojis by converting them to ':name:' form and counting those.
+    Text that already contains a literal ':word:' sequence is counted as well,
+    because the count is taken on the demojized string.
+    """
+    return count_words_surrounded_by_colons(emoji.demojize(tweet))
+def hashtag_frequency(tweet):
+    """Return the number of '#word' hashtags found in the tweet."""
+    return sum(1 for _ in re.finditer(r'#\w+', tweet))
+def mention_frequency(tweet):
+    """Return the number of '@word' mentions found in the tweet."""
+    return sum(1 for _ in re.finditer(r'@\w+', tweet))
+def count_special_characters(tweet):
+    """Return how many characters of the tweet are ASCII punctuation."""
+    punctuation_hits = 0
+    for ch in tweet:
+        if ch in string.punctuation:
+            punctuation_hits += 1
+    return punctuation_hits
+def stop_word_frequency(tweet):
+    """Return how many whitespace-separated words are English stop words.
+    Matching is case-insensitive and uses the NLTK English stop-word list.
+    """
+    english_stop_words = set(stopwords.words('english'))
+    return sum(1 for token in tweet.split() if token.lower() in english_stop_words)
+# Fetch the NLTK resources the feature functions rely on: 'punkt' for
+# word_tokenize, 'averaged_perceptron_tagger' for pos_tag and 'stopwords'
+# for stopwords.words. These are top-level calls, so they run on every import.
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('stopwords')
+def get_linguistic_features(tweet):
+    """Return part-of-speech based counts for a tweet as a dict.
+    The tweet is tokenized, reduced to lowercased alphanumeric tokens that are
+    not English stop words, POS-tagged, and then counted by tag prefix.
+    The returned dict always has the same eight keys, in the order shown in
+    the return statement.
+    """
+    # Tokenize the tweet
+    words = word_tokenize(tweet)
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]
+    # Get parts of speech tags
+    # NOTE(review): tagging happens after stop-word removal and lowercasing, so
+    # pronoun/preposition/conjunction counts are presumably close to zero for
+    # most tweets. Left as-is because the downstream model was presumably
+    # trained on features computed this way - confirm before changing.
+    pos_tags = pos_tag(filtered_words)
+    # Count various linguistic features
+    noun_count = sum(1 for word, pos in pos_tags if pos.startswith('N'))
+    verb_count = sum(1 for word, pos in pos_tags if pos.startswith('V'))
+    # 'ing' / 'ed' are matched anywhere in the word, not only as a suffix.
+    participle_count = sum(1 for word, pos in pos_tags if pos.startswith('V') and ('ing' in word or 'ed' in word))
+    interjection_count = sum(1 for word, pos in pos_tags if pos == 'UH')
+    pronoun_count = sum(1 for word, pos in pos_tags if pos.startswith('PRP'))
+    preposition_count = sum(1 for word, pos in pos_tags if pos.startswith('IN'))
+    adverb_count = sum(1 for word, pos in pos_tags if pos.startswith('RB'))
+    conjunction_count = sum(1 for word, pos in pos_tags if pos.startswith('CC'))
+    return {
+        'Noun_Count': noun_count,
+        'Verb_Count': verb_count,
+        'Participle_Count': participle_count,
+        'Interjection_Count': interjection_count,
+        'Pronoun_Count': pronoun_count,
+        'Preposition_Count': preposition_count,
+        'Adverb_Count': adverb_count,
+        'Conjunction_Count': conjunction_count
+    }
+def readability_score(tweet):
+    """Return the Flesch reading-ease score of the tweet as computed by textstat."""
+    score = textstat.flesch_reading_ease(tweet)
+    return score
+def get_url_frequency(tweet):
+    """Return the number of http:// or https:// URLs found in the tweet."""
+    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+    return sum(1 for _ in re.finditer(url_pattern, tweet))
+# Define a function to extract features from a single tweet
+def extract_features(tweet):
+    """Return the author/stylometric features of one tweet as a dict.
+    The dict has 18 entries (8 surface counts and ratios, 8 linguistic counts,
+    readability and URL count). load_model() appends the dict's values in
+    insertion order to the model's feature vector, so the key order here is
+    part of the contract - do not reorder or add keys casually.
+    """
+    features = {
+        'Average_Word_Length': average_word_length(tweet),
+        # 'Average_Sentence_Length': average_sentence_length(tweet),
+        'Lexical_Diversity': lexical_diversity(tweet),
+        'Capital_Letters_Count': count_capital_letters(tweet),
+        'Hashtag_Frequency': hashtag_frequency(tweet),
+        'Mention_Frequency': mention_frequency(tweet),
+        'count_emojis': count_emojis(tweet),
+        'special_chars_count': count_special_characters(tweet),
+        'Stop_Word_Frequency': stop_word_frequency(tweet),
+        **get_linguistic_features(tweet),  # eight POS-based counts, expanded in place
+        'Readability_Score': readability_score(tweet),
+        'URL_Frequency': get_url_frequency(tweet)
+    }
+    return features
+# # Extract features for all tweets
+# features_list = [extract_features(tweet) for tweet in X['text']]
+# # Create a Pandas DataFrame
+# X_new = pd.DataFrame(features_list)
+# Loading personality model
+def personality_detection(text, threshold=0.05, endpoint= 1.0):
+    """Return five personality scores for ``text`` as 0-d numpy arrays in [0, 1].
+    The scores are the sigmoid of the first five logits of the
+    "Nasserelsaman/microsoft-finetuned-personality" classifier, in the model's
+    own label order (presumably Agreeableness, Conscientiousness, Extraversion,
+    Neuroticism, Openness - confirm against the model config).
+    ``threshold`` and ``endpoint`` are accepted for backward compatibility but
+    are currently unused.
+    """
+    import os
+    # The access token was previously read from an undefined global
+    # (PERSONALITY_TOKEN), which raised NameError on the first call. Read it
+    # from the environment instead; None falls back to anonymous access.
+    hf_token = os.environ.get("PERSONALITY_TOKEN")
+    model_id = "Nasserelsaman/microsoft-finetuned-personality"
+    # NOTE(review): tokenizer and model are re-loaded on every call, which is
+    # slow; consider loading them once at start-up.
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=hf_token)
+    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
+    # Single forward pass, without building an autograd graph.
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    # Apply sigmoid to squash each logit between 0 and 1
+    probabilities = torch.sigmoid(logits)
+    return [probabilities[0][i].detach().numpy() for i in range(5)]
+# tokenizer = AutoTokenizer.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")
+# model = AutoModelForSequenceClassification.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")
+#Loading emotion model
+# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
+# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
+##use this for gpu
+# pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True,device=device )
+##use this for cpu
+def calc_emotion_score(tweet):
+    """Return the first eleven emotion scores for the tweet as a list of floats.
+    Scores come from the cardiffnlp multilabel emotion classifier, in the order
+    the pipeline reports its labels. Each label/score entry is also printed.
+    """
+    classifier = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True )
+    label_scores = classifier(tweet)[0]
+    for entry in label_scores:
+        print(entry)
+    return [label_scores[idx]['score'] for idx in range(11)]
+#DCL model launching
+def load_model(tweet):
+    """Classify one tweet and return the prediction as 0.0 or 1.0.
+    Despite the name, this does the whole inference: it tokenizes the tweet,
+    builds the auxiliary feature vector (emotion scores, author features and
+    personality scores), rebuilds the network, loads the weights from
+    ./model.pt and returns the rounded sigmoid of the model output.
+    Per the commit message the value is the hate-speech prediction; which
+    class 1.0 stands for is presumably "hate speech" - confirm against training.
+    NOTE(review): tokenizer, backbone and checkpoint are re-loaded on every
+    call, so each request pays the full model start-up cost.
+    """
+    # model = torch.load("./authormodel.pt",map_location ='cpu')
+    # print(model)
+    model_name = "vinai/bertweet-base"
+    PADDING_MAX_LENGTH = 45
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Pad/truncate to a fixed 45 tokens and return PyTorch tensors.
+    inputs = tokenizer(tweet, truncation=True, padding='max_length',max_length=PADDING_MAX_LENGTH,add_special_tokens=True, return_tensors="pt")
+    print(inputs)
+    # Feature vector layout: 11 emotion scores, then the values of
+    # extract_features() in dict order, then 5 personality scores.
+    emotion_list = calc_emotion_score(tweet)
+    print(emotion_list)
+    features_list = extract_features(tweet)
+    for i in features_list.values():
+        emotion_list.append(i)
+    print("emotion + author",emotion_list)
+    # print()
+    # print(features_list)
+    personality_list = personality_detection(tweet)
+    print("personality",personality_list)
+    # person_list = [personality_list["Extraversion"],personality_list['Neuroticism'],personality_list['Agreeableness'],personality_list['Conscientiousness'],personality_list['Openness']]
+    emotion_list.extend(personality_list)
+    print("final list",emotion_list)
+    # print(str(features_list["Average_Word_Length"]))
+    # Wrapped in an outer list so the tensor has a leading batch dimension of 1.
+    inputs['emotion_author_vector'] =  torch.tensor([emotion_list])
+    print("final inputs    ",inputs)
+    # []
+    # inputs["emotion_author_vector"] =
+    # train_dataloader=DataLoader(inputs, batch_size=1 , shuffle=False)
+    # print(train_dataloader)
+    device = torch.device("cuda:0"  if torch.cuda.is_available() else "cpu")
+    # def tokenize_function(examples):
+    #     return tokenizer.batch_encode_plus(examples["text"], padding='max_length',max_length=PADDING_MAX_LENGTH,add_special_tokens=True,truncation=True)
+    class EmotionAuthorGuidedDCLModel(nn.Module):
+        """Frozen encoder plus a linear head over [encoder output ; feature vector]."""
+        def __init__(self,dcl_model:nn.Module,dropout:float=0.5):
+            super(EmotionAuthorGuidedDCLModel, self).__init__()
+            self.dcl_model = dcl_model
+            # 802 = 768 (DCLArchitecture.dim) + 34 auxiliary features
+            # (11 emotion + 18 author + 5 personality).
+            self.dim = 802
+            self.dropout = nn.Dropout(dropout)
+            self.linear = nn.Linear(self.dim, 1)
+            # Freeze all layers
+            for param in self.dcl_model.parameters():
+                param.requires_grad = False
+        def forward(self,batch_tokenized):
+            input_ids = batch_tokenized['input_ids']
+            attention_mask = batch_tokenized['attention_mask']
+            emotion_vector = batch_tokenized['emotion_author_vector']
+            bert_output = self.dcl_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
+            # Index 1 of the encoder output - presumably the pooled output; confirm.
+            bert_cls_hidden_state = bert_output[1]
+            combined_vector =torch.cat((bert_cls_hidden_state,emotion_vector), 1)
+            d_combined_vector=self.dropout(combined_vector)
+            linear_output = self.linear(d_combined_vector)
+            pred_linear = linear_output.squeeze(1)
+            return pred_linear
+    # twee
+    # The file is loaded on CPU and used directly as a state dict below.
+    checkpoint = {
+        "model_state_dict":torch.load("./model.pt",map_location ='cpu') ,
+    }
+    # checkpoint=load_checkpoint(run=run_dcl_study,check_point_name="model_checkpoints/")
+    class DCLArchitecture(nn.Module):
+        """Encoder with a dense head; only its .bert submodule is used below."""
+        def __init__(self,dropout:float,bert_model_name:str='vinai/bertweet-base'):
+            super(DCLArchitecture, self).__init__()
+            self.bert = AutoModel.from_pretrained(bert_model_name)
+            self.dim = 768
+            self.dense = nn.Linear(self.dim, 1)
+            self.dropout = nn.Dropout(dropout)
+        def forward(self,batch_tokenized, if_train=False):
+            input_ids = batch_tokenized['input_ids']
+            attention_mask = batch_tokenized['attention_mask']
+            bert_output = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
+            bert_cls_hidden_state = bert_output[1]
+            torch.cuda.empty_cache()
+            if if_train:
+                bert_cls_hidden_state_aug = self.dropout(bert_cls_hidden_state)
+                bert_cls_hidden_state = torch.cat((bert_cls_hidden_state, bert_cls_hidden_state_aug), dim=1).reshape(-1, self.dim)
+            else:
+                bert_cls_hidden_state = self.dropout(bert_cls_hidden_state)
+            linear_output = self.dense(bert_cls_hidden_state)
+            linear_output = linear_output.squeeze(1)
+            return bert_cls_hidden_state, linear_output
+    # dcl_model = DCLArchitecture(bert_model_name=model_name,dropout=best_prams["DROPOUT"])
+    dcl_model = DCLArchitecture(bert_model_name=model_name,dropout=0.5)
+    dcl_model.to(device)
+    DROPOUT = 0.5
+    # Only the encoder is reused; DCLArchitecture's dense head is discarded.
+    fined_tuned_bert_model=dcl_model.bert
+    model = EmotionAuthorGuidedDCLModel(dcl_model=fined_tuned_bert_model,dropout=DROPOUT)
+    model.to(device)
+    # Overwrites the freshly downloaded encoder weights and the linear head
+    # with the checkpoint values (strict key matching by default).
+    model.load_state_dict(checkpoint["model_state_dict"])
+    # def test_loop(model, test_dataloader, device):
+    # # collection_metric = MetricCollection(
+    # #       BinaryAccuracy(),
+    # #       MulticlassPrecision(num_classes=2,average=average),
+    # #       MulticlassRecall(num_classes=2,average=average),
+    # #       MulticlassF1Score(num_classes=2,average=average),
+    # #       BinaryConfusionMatrix()
+    # # )
+    # # collection_metric.to(device)
+    #     model.eval()
+    #     print(test_dataloader)
+    #     # total_test_loss = 0.0
+    #     for batch in test_dataloader:
+    #         print(batch)
+    #         batch = {k: v.to(device) for k, v in batch.items()}
+    #         # labels = batch["labels"]
+    #         with torch.no_grad():
+    #             pred = model(batch)
+    #             # loss = criteon(pred, labels.float())
+    #             pred = torch.round(torch.sigmoid(pred))
+    #     return pred
+    # result_metrics=test_loop(model=model, test_dataloader=train_dataloader,device=device)
+    # print("Hate speech result",result_metrics)
+    def predict_single_text(model, inputs,device):
+        """Run one forward pass and return the rounded sigmoid as a Python float."""
+        # Preprocess the text
+        # inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Pass the preprocessed text through the model
+        with torch.no_grad():
+            # eval() disables dropout so the prediction is deterministic.
+            model.eval()
+            pred = model(inputs)
+            # Assuming your model returns a single value for prediction
+            pred = torch.round(torch.sigmoid(pred)).item()
+        return pred
+    predicted_class = predict_single_text(model, inputs, device)
+    return predicted_class
+    # print("Hate speech result",predicted_class)
+#Gradio interface
+def greet(tweet):
+    """Gradio callback: classify the submitted tweet and return the result as text.
+    Prints "start" and "end" around the call to load_model() as progress markers.
+    """
+    print("start")
+    result = load_model(tweet)
+    print("end")
+    return str(result)
 demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+# Start serving the interface; greet() is the handler for each submitted text.
+demo.launch()

model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a0522ff50dd3433230896898665a1b3a8d5fbaf72f5c2f6286a51e267f56b45
+size 539673670

requirements.txt CHANGED Viewed

@@ -1,2 +1,82 @@
-gradio
-torch

+aiofiles==23.2.1
+altair==5.3.0
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.1
+cycler==0.12.1
+emoji==2.11.1
+fastapi==0.110.3
+ffmpy==0.3.2
+filelock==3.14.0
+fonttools==4.51.0
+fsspec==2024.3.1
+gradio==4.28.3
+gradio_client==0.16.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+huggingface-hub==0.22.2
+idna==3.7
+importlib_resources==6.4.0
+intel-openmp==2021.4.0
+Jinja2==3.1.3
+joblib==1.4.0
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.8.4
+mdurl==0.1.2
+mkl==2021.4.0
+mpmath==1.3.0
+networkx==3.3
+nltk==3.8.1
+numpy==1.26.4
+orjson==3.10.2
+packaging==24.0
+pandas==2.2.2
+pillow==10.3.0
+pydantic==2.7.1
+pydantic_core==2.18.2
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.2
+pyphen==0.15.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.35.0
+regex==2024.4.28
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+ruff==0.4.2
+safetensors==0.4.3
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.37.2
+sympy==1.12
+tbb==2021.12.0
+textblob==0.18.0.post0
+textstat==0.7.3
+tokenizers==0.19.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.3.0
+tqdm==4.66.2
+transformers==4.40.1
+typer==0.12.3
+typing_extensions==4.11.0
+tzdata==2024.1
+urllib3==2.2.1
+uvicorn==0.29.0
+websockets==11.0.3