rsakadewa7 committed
Commit d1daaca · verified · 1 Parent(s): 604a722

Update predict.py

Files changed (1):
  1. predict.py +213 -205
predict.py CHANGED
@@ -1,206 +1,214 @@
 # Import libraries
 import logging
 import re
 import pandas as pd
 import numpy as np
 import tensorflow as tf
 import nltk
 from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
 from transformers import AutoTokenizer, TFBertModel
 from tensorflow.keras import backend as K
 from tensorflow.keras.models import load_model
 from tensorflow.keras.layers import Layer
 from tensorflow_addons.optimizers import AdamW
 import streamlit as st
 from nltk.corpus import stopwords
 from concurrent.futures import ThreadPoolExecutor
 import kagglehub
 import os

 # Download NLTK text-processing resources
 nltk.download('punkt')
 nltk.download('stopwords')

 # Logging configuration
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

 # Load dataset
 def load_dataset():
     try:
         path = kagglehub.dataset_download("dannytheodore/brimo-app-review")
         dataset_path = f"{path}/brimo_googleplaystore_review.csv"
         return pd.read_csv(dataset_path, index_col=0)
     except Exception as e:
         logging.error(f"Error loading dataset: {e}")
         st.error("Failed to load the dataset.")
         return None

 # Map review scores to negative / neutral / positive labels
 def map_labels(score):
     if score >= 4:
         return 2  # Positive
     elif score == 3:
         return 1  # Neutral
     else:
         return 0  # Negative

 # Preprocess text
 def preprocess_text(text, stop_words, stemmer):
     try:
         text = text.lower()
         text = re.sub(r"@[A-Za-z0-9_]+", " ", text)  # Remove mentions
         text = re.sub(r"#[A-Za-z0-9_]+", " ", text)  # Remove hashtags
         text = re.sub(r"http\S+", " ", text)         # Remove URLs
         text = re.sub(r"www\.\S+", " ", text)        # Remove www URLs
         text = re.sub(r"[^A-Za-z\s']", " ", text)    # Remove non-letter characters
         tokens = text.split()
         tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
         tokens = [stemmer.stem(word) for word in tokens]              # Apply stemming
         return ' '.join(tokens)
     except Exception as e:
         logging.error(f"Error processing text: {text}\n{e}")
         return text

 # Preprocess and tokenize reviews concurrently
 def preprocess_and_tokenize_reviews(reviews, tokenizer, stop_words, stemmer, max_length=128):
     with ThreadPoolExecutor() as executor:
         cleaned_reviews = list(executor.map(lambda x: preprocess_text(x, stop_words, stemmer), reviews))
     return tokenizer(cleaned_reviews, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')

 # Custom Keras layer wrapping the BERT base model
 class BertLayer(Layer):
     def __init__(self, base_model, **kwargs):
         super(BertLayer, self).__init__(**kwargs)
         self.base_model = base_model

     def call(self, inputs):
         input_ids, attention_mask = inputs
         outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
         return outputs.last_hidden_state

     def get_config(self):
         config = super(BertLayer, self).get_config()
         config.update({"base_model": self.base_model})
         return config

 # Pooler layer (from the first [CLS] token)
 class PoolerLayer(Layer):
     def __init__(self, **kwargs):
         super(PoolerLayer, self).__init__(**kwargs)

     def call(self, inputs):
         cls_token = inputs[:, 0, :]  # First token's output (the [CLS] token)
         pooled_output = tf.keras.activations.tanh(cls_token)  # Apply tanh activation
         return pooled_output

 # Custom F1 score metric
 class F1Score(tf.keras.metrics.Metric):
     def __init__(self, name="f1_score", **kwargs):
         super(F1Score, self).__init__(name=name, **kwargs)
         self.true_positives = self.add_weight(name="tp", initializer="zeros")
         self.false_positives = self.add_weight(name="fp", initializer="zeros")
         self.false_negatives = self.add_weight(name="fn", initializer="zeros")

     def update_state(self, y_true, y_pred, sample_weight=None):
         y_pred = tf.argmax(y_pred, axis=-1)
         y_true = tf.argmax(y_true, axis=-1)
         tp = tf.reduce_sum(tf.cast((y_true == y_pred) & (y_true != 0), tf.float32))
         fp = tf.reduce_sum(tf.cast((y_true != y_pred) & (y_pred != 0), tf.float32))
         fn = tf.reduce_sum(tf.cast((y_true != y_pred) & (y_true != 0), tf.float32))
         self.true_positives.assign_add(tp)
         self.false_positives.assign_add(fp)
         self.false_negatives.assign_add(fn)

     def result(self):
         precision = self.true_positives / (self.true_positives + self.false_positives + K.epsilon())
         recall = self.true_positives / (self.true_positives + self.false_negatives + K.epsilon())
         f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
         return f1

     def reset_state(self):
         self.true_positives.assign(0)
         self.false_positives.assign(0)
         self.false_negatives.assign(0)

 # Load model and tokenizer
 def load_model_and_tokenizer():
     try:
         model_path = 'best_model.h5'
         if os.path.exists(model_path):
             model = load_model(model_path, custom_objects={'TFBertModel': TFBertModel, 'BertLayer': BertLayer, 'PoolerLayer': PoolerLayer, 'F1Score': F1Score})
         else:
             st.error("Model file not found. Please check the file path.")
             return None, None
     except Exception as e:
         logging.error(f"Error loading model: {e}")
         st.error("Failed to load the model. Please check the model file and try again.")
         return None, None

     # Recreate the AdamW optimizer
     optimizer = AdamW(learning_rate=2e-5, weight_decay=1e-5)

     # Recompile the model with the AdamW optimizer
     model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=[F1Score()])

     # Load the tokenizer from the Hugging Face hub
     try:
         tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
     except Exception as e:
         logging.error(f"Error loading tokenizer: {e}")
         st.error("Failed to load the tokenizer. Please check the tokenizer files.")
         return None, None

     return model, tokenizer

 # Sentiment mapping
 sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

 # Run Streamlit app
 def run(model, tokenizer, stop_words, stemmer):
+    # Add a banner image
+    banner_image = "https://businessnews.co.id/wp-content/uploads/2021/04/Screenshot_112.jpg"
+    st.image(banner_image, use_column_width=True)
+
     # Set title and description
     st.title('Sentiment Analysis using IndoBERT')
     st.subheader('This application analyzes the sentiment of user-provided reviews.')

     # Input form
     with st.form(key='review_form'):
         review_input = st.text_area("Enter Review:", height=150)
         submit_button = st.form_submit_button("Analyze Sentiment")

     if submit_button:
         if review_input:
             # Preprocess and tokenize the review
             tokenized_review = preprocess_and_tokenize_reviews([review_input], tokenizer, stop_words, stemmer)

             # Make prediction
             if model:
                 predictions = model.predict({'input_ids': tokenized_review['input_ids'], 'attention_mask': tokenized_review['attention_mask']})
                 predicted_label = np.argmax(predictions, axis=-1)
                 sentiment = sentiment_map[predicted_label[0]]
                 st.write(f"### Predicted Sentiment: {sentiment}")
             else:
                 st.error("Model is not loaded. Please check the model file and try again.")
         else:
             st.error("Please enter a review to analyze.")

 if __name__ == "__main__":
     # Load necessary components
     df = load_dataset()
     model, tokenizer = load_model_and_tokenizer()

     if df is not None and model is not None and tokenizer is not None:
         # Prepare stopwords and stemmer, then label the dataset
         manual_stopwords = ["di", "ke", "dari", "yang", "dan", "atau", "dengan", "untuk", "ini", "itu", "aja", "saja", "lah", "bri", "brimo", "aplikasi", "rekening", "coba", "yg", "nya", "saya", "dia", "sangat", "video", "login", "apk", "jadi", "akun", "malah", "uang", "banget", "dalam", "atm", "padahal"]
         stop_words = set(stopwords.words('indonesian'))
         stop_words.update(manual_stopwords)
         factory = StemmerFactory()
         stemmer = factory.create_stemmer()
         df['label'] = df['score'].apply(map_labels)
         run(model, tokenizer, stop_words, stemmer)
     else:
         if df is None:
             logging.error("Failed to load dataset.")
             st.error("Failed to load the dataset. Please check the dataset file.")
         if model is None or tokenizer is None:
             logging.error("Failed to load model or tokenizer.")
             st.error("Failed to load the model or tokenizer. Please check the model file.")
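
For reviewers who want to exercise the text-cleaning path without launching the Streamlit UI, here is a minimal smoke-test sketch. It is not part of the commit: it assumes predict.py is importable from the working directory (so its top-level imports, including TensorFlow and tensorflow-addons, must be installed), and the sample review is invented for illustration.

    # Hypothetical smoke test for preprocess_text(); not part of this commit.
    import nltk
    from nltk.corpus import stopwords
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

    from predict import preprocess_text  # assumes predict.py is on the import path

    nltk.download('stopwords')
    stop_words = set(stopwords.words('indonesian'))
    stemmer = StemmerFactory().create_stemmer()

    # Invented Indonesian sample: "The app is really great, transfers are fast!"
    print(preprocess_text("Aplikasinya bagus banget, transfer cepat!", stop_words, stemmer))

The app itself is started with `streamlit run predict.py`, which is where the newly added banner image is rendered via st.image.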