zArabi committed on
Commit
1635166
1 Parent(s): fdabac1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -6
app.py CHANGED
@@ -1,12 +1,86 @@
1
  import gradio as gr
2
- from transformers import BertModel, BertConfig
 
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
  import huggingface_hub
6
  from huggingface_hub import hf_hub_download
 
 
 
7
 
8
  huggingface_hub.Repository = 'zArabi/Persian-Sentiment-Analysis'
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  class SentimentModel(nn.Module):
11
  def __init__(self, config):
12
  super(SentimentModel, self).__init__()
@@ -34,16 +108,17 @@ config = BertConfig.from_pretrained(
34
  label2id=label2id)
35
 
36
  downloadedModelFile = hf_hub_download(repo_id="zArabi/Persian-Sentiment-Analysis", filename="persianModel")
37
- loaded_model = torch.load(downloadedModelFile)
38
 
39
- max_len=512
40
 
41
- pipeline = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
 
 
42
 
43
  def predict(text):
44
  text = cleaning(text)
45
  encoding = tokenizer.encode_plus(
46
- sample_text,
47
  max_length=max_len,
48
  truncation=True,
49
  padding="max_length",
@@ -58,7 +133,7 @@ def predict(text):
58
  probs = F.softmax(outputs,dim=1)
59
  values, indices = torch.max(probs, dim=1)
60
  data = {
61
- 'comments': sample_text,
62
  'preds': indices.cpu().numpy()[0],
63
  'label': class_names[indices.cpu().numpy()[0]],
64
  'probablities': {class_names[i] : round(probs[0][i].item(),3) for i in range(len(probs[0]))}
 
1
  import gradio as gr
2
+ from transformers import BertModel, BertConfig, BertTokenizer
3
+ import torch
4
  import torch.nn as nn
5
  import torch.nn.functional as F
6
  import huggingface_hub
7
  from huggingface_hub import hf_hub_download
8
+ import hazm
9
+ from cleantext import clean
10
+ import regex as re
11
 
12
  huggingface_hub.Repository = 'zArabi/Persian-Sentiment-Analysis'
13
 
14
# Matches one HTML/XML-style tag: "<", the shortest run of any characters, ">".
# DOTALL lets the run include newlines so a tag whose attributes wrap across
# lines is still removed as a whole.
_HTML_TAG_RE = re.compile(r"<.*?>", flags=re.DOTALL)


def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag removed.

    Only the tags themselves are stripped; the text between an opening and a
    closing tag is kept. A lone ``<`` with no following ``>`` is left as is.

    Args:
        raw_html: The string that may contain markup.

    Returns:
        The input string without any ``<...>`` spans.
    """
    return _HTML_TAG_RE.sub("", raw_html)
18
+
19
# Characters stripped as "weird patterns": emoji blocks, pictographs, flags,
# variation selectors and bidirectional-isolate control characters.
# Compiled once at import time instead of on every call.
# NOTE: U+200C (zero-width non-joiner) is deliberately NOT in this class;
# presumably because Persian orthography relies on it -- confirm before adding.
_WEIRD_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u200d"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"
    "\u2069"
    "\u2066"
    "\u2068"
    "\u2067"
    "]+",
    flags=re.UNICODE,
)

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
_WHITESPACE_RUN = re.compile(r"\s+")

# Shared hazm normalizer, created on first use so importing the module stays
# cheap and the object is not rebuilt for every request.
_normalizer = None


def _get_normalizer():
    """Return the shared ``hazm.Normalizer``, creating it on first use."""
    global _normalizer
    if _normalizer is None:
        _normalizer = hazm.Normalizer()
    return _normalizer


def cleaning(text):
    """Normalize a raw comment before it is tokenized.

    The steps run in this order:

    1. Strip surrounding whitespace.
    2. Generic cleanup with ``cleantext.clean``: fix unicode, lowercase,
       drop line breaks, URLs, e-mail addresses, phone numbers and currency
       symbols; punctuation, numbers and digits are kept.
    3. Remove ``<...>`` tags with ``cleanhtml``.
    4. Normalize the text with ``hazm.Normalizer``.
    5. Remove emoji and the control characters listed in ``_WEIRD_PATTERN``.
    6. Drop ``#`` characters and collapse every whitespace run to one space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # regular cleaning
    # https://pypi.org/project/clean-text/ >> works well for eng and de languages
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,  # keep punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing > https://github.com/sobhe/hazm
    text = _get_normalizer().normalize(text)

    # removing weird patterns (emoji, pictographs, bidi control characters)
    text = _WEIRD_PATTERN.sub("", text)

    # removing hashtags' "#" marker and extra spaces
    text = text.replace("#", "")
    text = _WHITESPACE_RUN.sub(" ", text)

    return text
83
+
84
  class SentimentModel(nn.Module):
85
  def __init__(self, config):
86
  super(SentimentModel, self).__init__()
 
108
  label2id=label2id)
109
 
110
  downloadedModelFile = hf_hub_download(repo_id="zArabi/Persian-Sentiment-Analysis", filename="persianModel")
111
+ loaded_model = torch.load(downloadedModelFile,map_location="cpu")
112
 
 
113
 
114
+ tokenizer = BertTokenizer.from_pretrained(modelName)
115
+ max_len=512
116
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
117
 
118
  def predict(text):
119
  text = cleaning(text)
120
  encoding = tokenizer.encode_plus(
121
+ text,
122
  max_length=max_len,
123
  truncation=True,
124
  padding="max_length",
 
133
  probs = F.softmax(outputs,dim=1)
134
  values, indices = torch.max(probs, dim=1)
135
  data = {
136
+ 'comments': text,
137
  'preds': indices.cpu().numpy()[0],
138
  'label': class_names[indices.cpu().numpy()[0]],
139
  'probablities': {class_names[i] : round(probs[0][i].item(),3) for i in range(len(probs[0]))}