# zArabi's picture
# Update app.py
# 1635166
# raw
# history blame
# 4.54 kB
import gradio as gr
from transformers import BertModel, BertConfig, BertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import huggingface_hub
from huggingface_hub import hf_hub_download
import hazm
from cleantext import clean
import regex as re
# NOTE(review): this rebinds the `Repository` attribute of the imported
# `huggingface_hub` module to a plain string. Nothing in the visible code
# reads it back (the download call passes its `repo_id` explicitly), so this
# is presumably a leftover -- confirm before removing.
huggingface_hub.Repository = 'zArabi/Persian-Sentiment-Analysis'
def cleanhtml(raw_html):
    """Remove HTML-like tags from ``raw_html``.

    Every ``<...>`` span that opens and closes on the same line is deleted;
    the text between tags is kept untouched.

    Args:
        raw_html: Text that may contain markup.

    Returns:
        The input with all matched tags removed.
    """
    # Non-greedy match so each tag is removed on its own instead of
    # swallowing everything between the first '<' and the last '>'.
    return re.sub(r'<.*?>', '', raw_html)
# Character class of emoji, pictographs and invisible directional/formatting
# marks that `cleaning` strips. Built once at import time instead of on every
# call. U+200C (zero-width non-joiner) is deliberately NOT listed, so it
# survives cleaning.
_WEIRD_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    u"\u2068"
    u"\u2067"
    "]+",
    flags=re.UNICODE,
)


def cleaning(text):
    """Normalise a raw comment before it is tokenised.

    Steps, in order: trim, generic clean-up via ``cleantext.clean``, HTML tag
    removal, ``hazm`` normalisation, removal of emoji / invisible marks,
    removal of ``#`` characters, and collapsing of whitespace runs.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # Generic cleaning, see https://pypi.org/project/clean-text/
    # (works well for English and German). URLs, e-mails, phone numbers and
    # currency symbols are replaced by the empty string; numbers, digits and
    # punctuation are kept.
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,  # keep the punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # Strip HTML tags.
    text = cleanhtml(text)

    # Normalisation, see https://github.com/sobhe/hazm
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # Drop emoji and invisible formatting characters.
    text = _WEIRD_PATTERN.sub('', text)

    # Remove hashtag markers, then collapse every whitespace run to a single
    # space. A raw string is used so that `\s` is a regex escape rather than
    # an (invalid) string escape.
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)
    return text
class SentimentModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    NOTE(review): this class is never instantiated in the visible code; it is
    presumably required so that the pickled model loaded with ``torch.load``
    can be reconstructed -- keep the class and attribute names stable.
    """

    def __init__(self, config):
        """Build the encoder and the classification head.

        Args:
            config: Object exposing ``hidden_size`` and ``num_labels``, which
                set the input and output width of the linear head.
        """
        super().__init__()
        # `modelName` is a module-level global, looked up when the model is
        # constructed. `return_dict=False` makes the encoder return a tuple,
        # which `forward` unpacks.
        self.bert = BertModel.from_pretrained(modelName, return_dict=False)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits) for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            The output of the linear head applied to the encoder's pooled
            output (second element of the encoder's output tuple).
        """
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)
# Base checkpoint used for both the tokenizer and the BERT configuration.
modelName = 'HooshvareLab/bert-fa-base-uncased'

# Class labels; the list index is the class id produced by the model.
class_names = ['negative', 'neutral', 'positive']
label2id = {label: i for i, label in enumerate(class_names)}
id2label = {v: k for k, v in label2id.items()}

config = BertConfig.from_pretrained(
    modelName,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)

# Inference device: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

downloadedModelFile = hf_hub_download(
    repo_id="zArabi/Persian-Sentiment-Analysis",
    filename="persianModel",
)
# NOTE(security): torch.load unpickles the downloaded file, which can execute
# arbitrary code -- only load checkpoints from a repository you trust.
loaded_model = torch.load(downloadedModelFile, map_location="cpu")
# `predict` moves its input tensors to `device`, so the model must live there
# too; otherwise a GPU host fails with a device-mismatch error.
loaded_model.to(device)
# Inference mode: disables dropout so predictions are deterministic.
loaded_model.eval()

tokenizer = BertTokenizer.from_pretrained(modelName)
# Every input is truncated/padded to this many tokens in `predict`.
max_len = 512
def predict(text):
    """Classify the sentiment of ``text``.

    The text is cleaned, tokenised to a fixed length of ``max_len`` tokens,
    and run through ``loaded_model``; the logits are turned into
    probabilities with a softmax.

    Args:
        text: The raw sentence entered by the user.

    Returns:
        A dict mapping each name in ``class_names`` to its probability,
        rounded to three decimals.
    """
    text = cleaning(text)
    encoding = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        padding="max_length",
        add_special_tokens=True,  # add '[CLS]' and '[SEP]'
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',  # return PyTorch tensors
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # No gradients are needed for inference; skipping them saves memory and
    # time.
    with torch.no_grad():
        outputs = loaded_model(input_ids, attention_mask)

    # Only one sentence is encoded, so take the first (only) row.
    probs = F.softmax(outputs, dim=1)[0].tolist()
    return {label: round(prob, 3) for label, prob in zip(class_names, probs)}
# Web UI: a two-line text box in, a label widget showing the three class
# probabilities out. `gr.Label` is the top-level component matching the
# `gr.Textbox` already used for the input; the legacy `gr.outputs` namespace
# is deprecated and absent from newer Gradio releases.
demo = gr.Interface(
    predict,
    inputs=gr.Textbox(label="Explore your sentence!", lines=2, placeholder="Type Here..."),
    outputs=gr.Label(num_top_classes=3),
    title="How are feeling?!",
)
demo.launch()