rsakadewa7 committed (verified)
Commit 1856939 · Parent: f70da79

First Commit
app.py ADDED
@@ -0,0 +1,34 @@
+ # import libraries
+ import streamlit as st
+ import predict
+ import logging
+
+ # Logging configuration
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Load necessary components
+ df = predict.load_dataset()
+ model, tokenizer = predict.load_model_and_tokenizer()
+
+ if df is not None and model is not None and tokenizer is not None:
+     # Preprocess dataset and prepare stopwords and stemmer
+     manual_stopwords = ["di", "ke", "dari", "yang", "dan", "atau", "dengan", "untuk", "ini", "itu", "aja", "saja", "lah", "bri", "brimo", "aplikasi", "rekening", "coba", "yg", "ke", "untuk", "nya", "saya", "dia", "dan", "sangat", "video", "login", "apk", "jadi", "akun", "malah", "uang", "banget", "dalam", "atm", "padahal"]
+     stop_words = set(predict.stopwords.words('indonesian'))
+     stop_words.update(manual_stopwords)
+     factory = predict.StemmerFactory()
+     stemmer = factory.create_stemmer()
+     df['label'] = df['score'].apply(predict.map_labels)
+
+     navigation = st.sidebar.selectbox('Choose Page:', ('predictor', 'eda'))
+
+     if navigation == 'predictor':
+         predict.run(model, tokenizer, stop_words, stemmer)
+     # elif navigation == 'eda':
+     #     eda.run()
+ else:
+     if df is None:
+         logging.error("Failed to load dataset.")
+         st.error("Failed to load the dataset. Please check the dataset file.")
+     if model is None or tokenizer is None:
+         logging.error("Failed to load model or tokenizer.")
+         st.error("Failed to load the model or tokenizer. Please check the model file.")
best_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:207d19ec305831f9f0e1bf5ec0d7f76b79d912de521c39a405bf4ff42890f26e
+ size 498080376
predict.py ADDED
@@ -0,0 +1,206 @@
+ # Import libraries
+ import logging
+ import re
+ import pandas as pd
+ import numpy as np
+ import tensorflow as tf
+ import nltk
+ from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
+ from transformers import AutoTokenizer, TFBertModel
+ from tensorflow.keras import backend as K
+ from tensorflow.keras.models import load_model
+ from tensorflow.keras.layers import Layer
+ from tensorflow_addons.optimizers import AdamW
+ import streamlit as st
+ from nltk.corpus import stopwords
+ from concurrent.futures import ThreadPoolExecutor
+ import kagglehub
+ import os
+
+ # Text processing resources
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ # Logging configuration
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Load dataset
+ def load_dataset():
+     try:
+         path = kagglehub.dataset_download("dannytheodore/brimo-app-review")
+         dataset_path = f"{path}/brimo_googleplaystore_review.csv"
+         return pd.read_csv(dataset_path, index_col=0)
+     except Exception as e:
+         logging.error(f"Error loading dataset: {e}")
+         st.error("Failed to load the dataset.")
+         return None
+
+ # Map the labels to positive, neutral, negative
+ def map_labels(score):
+     if score >= 4:
+         return 2  # Positive
+     elif score == 3:
+         return 1  # Neutral
+     else:
+         return 0  # Negative
+
+ # Preprocess text
+ def preprocess_text(text, stop_words, stemmer):
+     try:
+         text = text.lower()
+         text = re.sub(r"@[A-Za-z0-9_]+", " ", text)  # Remove mentions
+         text = re.sub(r"#[A-Za-z0-9_]+", " ", text)  # Remove hashtags
+         text = re.sub(r"http\S+", " ", text)  # Remove URLs
+         text = re.sub(r"www\.\S+", " ", text)  # Remove www URLs
+         text = re.sub(r"[^A-Za-z\s']", " ", text)  # Remove non-letter characters
+         tokens = text.split()
+         tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
+         tokens = [stemmer.stem(word) for word in tokens]  # Apply stemming
+         return ' '.join(tokens)
+     except Exception as e:
+         logging.error(f"Error processing text: {text}\n{e}")
+         return text
+
+ # Preprocess and tokenize reviews concurrently with a thread pool
+ def preprocess_and_tokenize_reviews(reviews, tokenizer, stop_words, stemmer, max_length=128):
+     with ThreadPoolExecutor() as executor:
+         cleaned_reviews = list(executor.map(lambda x: preprocess_text(x, stop_words, stemmer), reviews))
+     return tokenizer(cleaned_reviews, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
+
+ # Custom Keras layer wrapping the IndoBERT encoder
+ class BertLayer(Layer):
+     def __init__(self, base_model, **kwargs):
+         super(BertLayer, self).__init__(**kwargs)
+         self.base_model = base_model
+
+     def call(self, inputs):
+         input_ids, attention_mask = inputs
+         outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
+         return outputs.last_hidden_state
+
+     def get_config(self):
+         config = super(BertLayer, self).get_config()
+         config.update({"base_model": self.base_model})
+         return config
+
+ # Pooler layer (built from the first [CLS] token)
+ class PoolerLayer(Layer):
+     def __init__(self, **kwargs):
+         super(PoolerLayer, self).__init__(**kwargs)
+
+     def call(self, inputs):
+         cls_token = inputs[:, 0, :]  # First token's output (the [CLS] token)
+         pooled_output = tf.keras.activations.tanh(cls_token)  # Apply tanh activation
+         return pooled_output
+
+ # Custom F1 Score Metric
+ class F1Score(tf.keras.metrics.Metric):
+     def __init__(self, name="f1_score", **kwargs):
+         super(F1Score, self).__init__(name=name, **kwargs)
+         self.true_positives = self.add_weight(name="tp", initializer="zeros")
+         self.false_positives = self.add_weight(name="fp", initializer="zeros")
+         self.false_negatives = self.add_weight(name="fn", initializer="zeros")
+
+     def update_state(self, y_true, y_pred, sample_weight=None):
+         y_pred = tf.argmax(y_pred, axis=-1)
+         y_true = tf.argmax(y_true, axis=-1)
+         tp = tf.reduce_sum(tf.cast((y_true == y_pred) & (y_true != 0), tf.float32))
+         fp = tf.reduce_sum(tf.cast((y_true != y_pred) & (y_pred != 0), tf.float32))
+         fn = tf.reduce_sum(tf.cast((y_true != y_pred) & (y_true != 0), tf.float32))
+         self.true_positives.assign_add(tp)
+         self.false_positives.assign_add(fp)
+         self.false_negatives.assign_add(fn)
+
+     def result(self):
+         precision = self.true_positives / (self.true_positives + self.false_positives + K.epsilon())
+         recall = self.true_positives / (self.true_positives + self.false_negatives + K.epsilon())
+         f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
+         return f1
+
+     def reset_state(self):
+         self.true_positives.assign(0)
+         self.false_positives.assign(0)
+         self.false_negatives.assign(0)
+
+ # Load model and tokenizer
+ def load_model_and_tokenizer():
+     try:
+         model_path = 'best_model.h5'
+         if os.path.exists(model_path):
+             model = load_model(model_path, custom_objects={'TFBertModel': TFBertModel, 'BertLayer': BertLayer, 'PoolerLayer': PoolerLayer, 'F1Score': F1Score})
+         else:
+             st.error("Model file not found. Please check the file path.")
+             return None, None
+     except Exception as e:
+         logging.error(f"Error loading model: {e}")
+         st.error("Failed to load the model. Please check the model file and try again.")
+         return None, None
+
+     # Recreate the AdamW optimizer
+     optimizer = AdamW(learning_rate=2e-5, weight_decay=1e-5)
+
+     # Recompile the model with the AdamW optimizer
+     model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=[F1Score()])
+
+     # Load the IndoBERT tokenizer from the Hugging Face Hub
+     try:
+         tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
+     except Exception as e:
+         logging.error(f"Error loading tokenizer: {e}")
+         st.error("Failed to load the tokenizer. Please check the tokenizer files.")
+         return None, None
+
+     return model, tokenizer
+
+
+ # Sentiment mapping
+ sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
+
+ # Run Streamlit app
+ def run(model, tokenizer, stop_words, stemmer):
+     # Set title and description
+     st.title('Sentiment Analysis using IndoBERT')
+     st.subheader('This application analyzes the sentiment of user-provided reviews.')
+
+     # Input form
+     with st.form(key='review_form'):
+         review_input = st.text_area("Enter Review:", height=150)
+         submit_button = st.form_submit_button("Analyze Sentiment")
+
+     if submit_button:
+         if review_input:
+             # Preprocess and tokenize the review
+             tokenized_review = preprocess_and_tokenize_reviews([review_input], tokenizer, stop_words, stemmer)
+
+             # Make prediction
+             if model:
+                 predictions = model.predict({'input_ids': tokenized_review['input_ids'], 'attention_mask': tokenized_review['attention_mask']})
+                 predicted_label = np.argmax(predictions, axis=-1)
+                 sentiment = sentiment_map[predicted_label[0]]
+                 st.write(f"### Predicted Sentiment: {sentiment}")
+             else:
+                 st.error("Model is not loaded. Please check the model file and try again.")
+         else:
+             st.error("Please enter a review to analyze.")
+
+ if __name__ == "__main__":
+     # Load necessary components
+     df = load_dataset()
+     model, tokenizer = load_model_and_tokenizer()
+
+     if df is not None and model is not None and tokenizer is not None:
+         # Preprocess dataset and prepare stopwords and stemmer
+         manual_stopwords = ["di", "ke", "dari", "yang", "dan", "atau", "dengan", "untuk", "ini", "itu", "aja", "saja", "lah", "bri", "brimo", "aplikasi", "rekening", "coba", "yg", "ke", "untuk", "nya", "saya", "dia", "dan", "sangat", "video", "login", "apk", "jadi", "akun", "malah", "uang", "banget", "dalam", "atm", "padahal"]
+         stop_words = set(stopwords.words('indonesian'))
+         stop_words.update(manual_stopwords)
+         factory = StemmerFactory()
+         stemmer = factory.create_stemmer()
+         df['label'] = df['score'].apply(map_labels)
+         run(model, tokenizer, stop_words, stemmer)
+     else:
+         if df is None:
+             logging.error("Failed to load dataset.")
+             st.error("Failed to load the dataset. Please check the dataset file.")
+         if model is None or tokenizer is None:
+             logging.error("Failed to load model or tokenizer.")
+             st.error("Failed to load the model or tokenizer. Please check the model file.")
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ tensorflow==2.11.0
+ tensorflow-addons
+ Sastrawi
+ datasets
+ transformers
+ kagglehub
+ nltk
+ streamlit
+ pandas
+ numpy
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
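
Since the committed tokenizer/ folder holds the full tokenizer artefacts (tokenizer.json, tokenizer_config.json, special_tokens_map.json, vocab.txt), the tokenizer could also be loaded locally rather than downloaded from the Hub as predict.py currently does. A minimal sketch, assuming the working directory is the repository root:

```python
# Minimal sketch: load the committed local tokenizer instead of downloading
# 'indobenchmark/indobert-base-p1' from the Hugging Face Hub.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('tokenizer')  # local folder in this repo
print(tokenizer("brimo cepat dan mudah", return_tensors='tf'))
```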
tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff