"""Gradio demo: Persian sentiment classification with a BERT-based model.

The script cleans the input text, tokenizes it with the
``HooshvareLab/bert-fa-base-uncased`` tokenizer, runs a classifier checkpoint
downloaded from the Hugging Face Hub, and shows the per-class probabilities
in a Gradio ``Label`` component.
"""
import gradio as gr
from transformers import BertModel, BertConfig, BertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import huggingface_hub
from huggingface_hub import hf_hub_download
import hazm
from cleantext import clean
import regex as re

# Hub repository that hosts the classifier checkpoint.
# (The original assigned this string to ``huggingface_hub.Repository``, which
# clobbered a library class; a plain constant is what was intended.)
REPO_ID = 'zArabi/Persian-Sentiment-Analysis'

# Compiled once at import time instead of on every call to ``cleaning``.
_HTML_TAG_RE = re.compile('<.*?>')
_WHITESPACE_RE = re.compile(r"\s+")
_WEIRD_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"  (deliberately left out of the class, so U+200C is kept)
    u"\u2068"
    u"\u2067"
    "]+",
    flags=re.UNICODE,
)

# Normalizer from https://github.com/sobhe/hazm, built once and reused.
_NORMALIZER = hazm.Normalizer()


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag-like span removed."""
    return _HTML_TAG_RE.sub('', raw_html)


def cleaning(text):
    """Normalize raw user text before tokenization.

    Steps, in order: strip surrounding whitespace, run ``cleantext.clean``
    (lower-casing, removal of URLs / e-mails / phone numbers / currency
    symbols, punctuation kept), drop HTML tags, apply the hazm normalizer,
    remove emoji and directional control characters, remove ``#`` and
    collapse runs of whitespace into a single space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # regular cleaning
    # https://pypi.org/project/clean-text/ >> works well for eng and de languages
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,  # keep the punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    text = _NORMALIZER.normalize(text)

    # removing weird patterns (emoji, pictographs, directional marks)
    text = _WEIRD_PATTERN.sub(r'', text)

    # removing extra spaces, hashtags
    text = text.replace("#", "")
    text = _WHITESPACE_RE.sub(" ", text)

    return text


class SentimentModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    NOTE(review): this class is never instantiated in this script; presumably
    the downloaded checkpoint is a pickled instance of it, in which case the
    class name and attribute names must stay as they are -- confirm.
    """

    def __init__(self, config):
        """Build the encoder and the head.

        Args:
            config: Object providing ``hidden_size`` and ``num_labels`` for
                the linear layer.
        """
        super().__init__()
        # ``modelName`` is a module-level global defined further down; it is
        # only looked up when the constructor actually runs.
        self.bert = BertModel.from_pretrained(modelName, return_dict=False)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for the given encoded batch."""
        # With return_dict=False the encoder returns a tuple; the second
        # element is what the head consumes.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


modelName = 'HooshvareLab/bert-fa-base-uncased'
class_names = ['negative', 'neutral', 'positive']
label2id = {label: i for i, label in enumerate(class_names)}
id2label = {v: k for k, v in label2id.items()}

config = BertConfig.from_pretrained(
    modelName,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

downloadedModelFile = hf_hub_download(repo_id=REPO_ID, filename="persianModel")
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a repository you trust.
# NOTE(review): recent PyTorch releases changed the default of
# ``weights_only``; a fully pickled module may need ``weights_only=False``
# there -- confirm against the deployed torch version.
loaded_model = torch.load(downloadedModelFile, map_location="cpu")
# Inputs are moved to ``device`` in ``predict``, so the model has to live on
# the same device; eval() disables dropout so predictions are deterministic.
loaded_model.to(device)
loaded_model.eval()

tokenizer = BertTokenizer.from_pretrained(modelName)
max_len = 512


def predict(text):
    """Classify ``text`` and return the probability of each class.

    Args:
        text: Raw input sentence.

    Returns:
        A dict mapping each name in ``class_names`` to its softmax
        probability rounded to three decimals.
    """
    text = cleaning(text)
    encoding = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        padding="max_length",
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',  # Return PyTorch tensors
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = loaded_model(input_ids, attention_mask)

    # A single sentence is encoded, so only the first row is read.
    probs = F.softmax(outputs, dim=1)[0].tolist()
    return {name: round(prob, 3) for name, prob in zip(class_names, probs)}


demo = gr.Interface(
    predict,
    inputs=gr.Textbox(label="Explore your sentence!", lines=2, placeholder="Type Here..."),
    outputs=gr.Label(num_top_classes=3),
    title="How are feeling?!",
)

if __name__ == "__main__":
    demo.launch()