prthgo committed on
Commit
6335d24
·
1 Parent(s): e04b6b4

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +1018 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,1018 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import io
5
+ import matplotlib.pyplot as plt
6
+ from matplotlib.ticker import PercentFormatter
7
+ import seaborn as sns
8
+ from sklearn.preprocessing import (
9
+ OneHotEncoder,
10
+ OrdinalEncoder,
11
+ StandardScaler,
12
+ MinMaxScaler,
13
+ )
14
+ from sklearn.model_selection import train_test_split
15
+ from imblearn.under_sampling import RandomUnderSampler
16
+ from imblearn.over_sampling import RandomOverSampler, SMOTE
17
+ from sklearn.linear_model import Ridge, Lasso, LogisticRegression
18
+ from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
19
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
20
+ from sklearn.svm import SVR, SVC
21
+ from sklearn.naive_bayes import MultinomialNB
22
+ from xgboost import XGBRFRegressor, XGBRFClassifier
23
+ from lightgbm import LGBMRegressor, LGBMClassifier
24
+ from sklearn.metrics import (
25
+ mean_absolute_error,
26
+ mean_squared_error,
27
+ mean_squared_error,
28
+ r2_score,
29
+ )
30
+ from sklearn.metrics import (
31
+ accuracy_score,
32
+ f1_score,
33
+ roc_auc_score,
34
+ confusion_matrix,
35
+ )
36
+ import pickle
37
+
38
# Page chrome and plotting defaults; executed once per script run, before any
# other Streamlit call.
st.set_page_config(page_title="Tabular Data Analysis and Auto ML", page_icon="🤖")
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)

# Colour list shared by the plotting helpers below, which index or slice it
# positionally (e.g. palette[3], palette[1:]), so the order matters.
palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]
57
+
58
+
59
def main():
    """Render the Streamlit app: upload a CSV, then explore, clean and model it.

    The uploaded table is kept in ``st.session_state["data"]`` so it survives
    Streamlit's script reruns. The sidebar radio decides which page is
    rendered on each run; nothing but the welcome banner is shown until a
    file has been uploaded and "Process" has been pressed.
    """
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process")
    option = st.sidebar.radio(
        "Select an Option: ",
        (
            "Basic EDA",
            "Univariate Analysis",
            "Bivariate Analysis",
            "Preprocess",
            "Training and Evaluation",
        ),
    )
    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
        unsafe_allow_html=True,
    )

    if file is not None and process:
        st.session_state["data"] = load_csv(file)

    if "data" not in st.session_state:
        return

    data = st.session_state["data"]
    placeholder.empty()

    pages = {
        "Basic EDA": _render_basic_eda,
        "Univariate Analysis": _render_univariate,
        "Bivariate Analysis": _render_bivariate,
        "Preprocess": _render_preprocess,
        "Training and Evaluation": _render_training,
    }
    pages[option](data)


# Display name -> (estimator factory, download file name).  Factories are
# lambdas so an estimator is only constructed for the model actually chosen.
_REGRESSORS = {
    "Ridge Regression": (lambda: Ridge(alpha=1.0), "ridge_regression_model.pkl"),
    "Decision Tree Regressor": (
        lambda: DecisionTreeRegressor(max_depth=10),
        "decision_tree_regression_model.pkl",
    ),
    "Random Forest Regressor": (
        lambda: RandomForestRegressor(max_depth=10, n_estimators=100),
        "random_forest_regression_model.pkl",
    ),
    "SVR": (lambda: SVR(C=1.0, epsilon=0.2), "svr_model.pkl"),
    "XGBRF Regressor": (
        lambda: XGBRFRegressor(reg_lambda=1),
        "xgbrf_regression_model.pkl",
    ),
    "LGBM Regressor": (
        lambda: LGBMRegressor(reg_lambda=1),
        "lgbm_regression_model.pkl",
    ),
}

# Display name -> (estimator factory, download file name, confusion-matrix title).
_CLASSIFIERS = {
    "Logistic Regression": (
        lambda: LogisticRegression(penalty="l2"),
        "logistic_regression_model.pkl",
        "Logistic Regression Confusion Matrix ",
    ),
    "Decision Tree Classifier": (
        lambda: DecisionTreeClassifier(max_depth=5),
        "decision_tree_classifier_model.pkl",
        "DecisionTree Classifier Confusion Matrix ",
    ),
    "Random Forest Classifier": (
        lambda: RandomForestClassifier(n_estimators=100, max_depth=5),
        "random_forest_classifier_model.pkl",
        "RandomForest Classifier Confusion Matrix ",
    ),
    "SVC": (lambda: SVC(C=1.5), "svc_model.pkl", "SVC Confusion Matrix "),
    "XGBRF Classifier": (
        lambda: XGBRFClassifier(reg_lambda=1.0),
        "xgbrf_classifier_model.pkl",
        "XGBRF Classifier Confusion Matrix ",
    ),
    "LGBM Classifier": (
        lambda: LGBMClassifier(reg_lambda=1.0),
        "lgbm_classifier_model.pkl",
        "LGBM Classifier Confusion Matrix ",
    ),
}


def _page_title(title):
    """Render a centred ``<h1>`` page heading."""
    st.markdown(
        f"<h1 style='text-align: center;'>{title}</h1>", unsafe_allow_html=True
    )


def _render_basic_eda(data):
    """Show the overview, dtypes, missing values, value counts and describe()."""
    _page_title("Basic EDA")

    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())

    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)

    st.subheader("Missing Data")
    missing_data(data)

    st.subheader("Value Counts")
    value_counts(data)

    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _render_univariate(data):
    """Let the user pick a single-column chart and the column to draw."""
    _page_title("Univariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Count Plot", "Pie Chart", "Histogram", "Violin Plot", "Scatter Plot"),
    )

    categorical = list(data.select_dtypes("O"))
    numeric = list(data.select_dtypes(include=["int", "float"]))
    charts = {
        "Count Plot": (categorical, countplot),
        "Pie Chart": (categorical, piechart),
        "Histogram": (numeric, histogram),
        "Violin Plot": (numeric, violinplot),
        "Scatter Plot": (numeric, scatterplot),
    }
    choices, draw = charts[plot]

    # The leading "" keeps the page empty until a column is chosen.
    column = st.selectbox("Select a column", [""] + choices)
    if column:
        draw(data, column)


def _render_bivariate(data):
    """Let the user pick a two-column chart (or a Pareto chart) and draw it."""
    _page_title("Bivariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )

    if plot == "Pareto Chart":
        column = st.selectbox(
            "Select a columns",
            [""] + list(data.select_dtypes(include="object")),
        )
        if column:
            paretoplot(data, column)
        return

    # No "" entry here: in a multiselect it would be selectable as a (bogus)
    # column name.
    if plot == "Scatter Plot":
        choices = list(data.select_dtypes(include=["int", "float"]))
        draw = biscatterplot
    elif plot == "Bar Plot":
        choices, draw = list(data.columns), bibarplot
    else:
        choices, draw = list(data.columns), biboxplot

    columns = st.multiselect("Select two columns", choices)
    if len(columns) >= 2:
        draw(data, columns)
    elif columns:
        st.info("Select a second column to draw the chart.")


def _render_preprocess(data):
    """Run the chosen preprocessing step and offer the result for download.

    The processed frame is written back to ``st.session_state["data"]``;
    without that, steps that build a new frame (dropping rows, one-hot
    encoding) would be lost on the next rerun.
    """
    _page_title("Data Preprocessing")

    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )

    if operation == "Drop Columns":
        columns = st.multiselect("Select Columns to drop: ", list(data.columns))
        if st.button("Drop Columns"):
            data = data.drop(columns=columns)
            st.success("Dropped selected columns✅✅✅")

    elif operation == "Handling Missing Values":
        num_missing = st.selectbox(
            "Select a Approach (Numerical columns only): ",
            ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
        ).lower()
        cat_missing = st.selectbox(
            "Select a Approach (Categorical columns only): ",
            ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
        ).lower()
        if st.button("Handle Missing Values"):
            data = _impute_missing(data, num_missing, cat_missing)

    elif operation == "Encode Categorical Features":
        oe_columns = st.multiselect(
            "Choose Columns for Ordinal Encoding",
            list(data.select_dtypes(include="object")),
        )
        st.info("Other columns will be One Hot Encoded.")
        if st.button("Encode Columns"):
            data = _encode_categoricals(data, oe_columns)
            st.success("Encoded categorical columns")

    st.session_state["data"] = data

    st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    )


def _impute_missing(data, num_missing, cat_missing):
    """Return a copy of *data* with missing values handled as requested.

    *num_missing* / *cat_missing* are the lower-cased widget choices; an empty
    string leaves that group of columns untouched.
    """
    frame = data.copy()

    if num_missing:
        num_cols = frame.select_dtypes(include="number").columns
        if num_missing == "drop":
            frame = frame.dropna(subset=num_cols)
        elif num_missing == "mean":
            frame = frame.fillna(frame[num_cols].mean())
        elif num_missing == "median":
            frame = frame.fillna(frame[num_cols].median())
        elif num_missing == "backward fill":
            frame[num_cols] = frame[num_cols].bfill()
        elif num_missing == "forward fill":
            frame[num_cols] = frame[num_cols].ffill()
        st.success(
            "Imputed missing values in numerical columns with selected approach."
        )

    if cat_missing:
        cat_cols = frame.select_dtypes(exclude="number").columns
        if cat_missing == "drop":
            frame = frame.dropna(subset=cat_cols)
        elif cat_missing == "most frequent values":
            modes = frame[cat_cols].mode()
            # mode() is empty when there are no such columns or no values.
            if not modes.empty:
                frame[cat_cols] = frame[cat_cols].fillna(modes.iloc[0])
        elif cat_missing == "replace with 'unknown'":
            frame[cat_cols] = frame[cat_cols].fillna("Unknown")
        st.success(
            "Imputed missing values in categorical columns with selected approach."
        )

    return frame


def _encode_categoricals(data, ordinal_columns):
    """Return a copy of *data* with bool/object columns turned into numbers.

    Columns in *ordinal_columns* are ordinal-encoded; every remaining object
    column is one-hot encoded; boolean columns become 0/1 integers.
    """
    frame = data.copy()

    bool_columns = frame.select_dtypes(include=bool).columns
    if len(bool_columns):
        frame[bool_columns] = frame[bool_columns].astype(int)

    if ordinal_columns:
        frame[ordinal_columns] = OrdinalEncoder().fit_transform(
            frame[ordinal_columns].astype("str")
        )

    one_hot_columns = [
        col
        for col in frame.select_dtypes(include="object")
        if col not in ordinal_columns
    ]
    if one_hot_columns:
        frame = pd.get_dummies(
            frame, columns=one_hot_columns, drop_first=False, dtype=int
        )
    return frame


def _render_training(data):
    """Collect the training options, fit the chosen model and report metrics."""
    _page_title("Training and Evaluation")
    algo = st.selectbox("Choose Algorithm Type:", ("", "Regression", "Classification"))
    if not algo:
        return

    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )

    sample = ""
    if algo == "Classification":
        balance = st.selectbox("Do you want to balance dataset?", ("", "Yes", "No"))
        if balance == "Yes":
            piechart(data, target)
            sample = st.selectbox(
                "Which approach you want to use?",
                ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
            )

    scale = st.selectbox(
        "Choose how do you want to scale features:",
        ("", "Standard Scaler", "Min Max Scaler"),
    )

    if algo == "Regression":
        registry = _REGRESSORS
        prompt = "Choose Regression Model for training: "
    else:
        registry = _CLASSIFIERS
        prompt = "Choose Classification Model for training: "
    model = st.selectbox(prompt, ("",) + tuple(registry))
    if not model:
        return

    build, file_name = registry[model][:2]
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            data.drop(columns=[target]),
            data[target],
            test_size=0.2,
            random_state=42,
        )
        # Only the training split is resampled, so the test split keeps the
        # original class distribution.
        X_train, y_train = _resample(sample, X_train, y_train)
        X_train, X_test = _scale(scale, X_train, X_test)
        estimator = build()
        estimator.fit(X_train, y_train)
        pred = estimator.predict(X_test)
    except Exception as exc:  # UI boundary: show the failure, don't crash the page
        st.error(f"Training failed: {exc}")
        return

    if algo == "Regression":
        _report_regression(y_test, pred)
    else:
        _report_classification(y_test, pred, registry[model][2])

    # NOTE: only the estimator is pickled; a scaler chosen above is not part
    # of the download, so callers must scale inputs the same way themselves.
    st.download_button(
        label="Download Trained Model",
        key="trained_model",
        data=pickle.dumps(estimator),
        file_name=file_name,
        mime="application/octet-stream",
    )


def _resample(method, X_train, y_train):
    """Balance the training split with the chosen sampler, if any."""
    samplers = {
        "Random Under Sampling": RandomUnderSampler,
        "Random Over Sampling": RandomOverSampler,
        "SMOTE": SMOTE,
    }
    if method not in samplers:
        return X_train, y_train
    return samplers[method](random_state=42).fit_resample(X_train, y_train)


def _scale(method, X_train, X_test):
    """Fit the chosen scaler on the training split and apply it to both splits."""
    scalers = {"Standard Scaler": StandardScaler, "Min Max Scaler": MinMaxScaler}
    if method not in scalers:
        return X_train, X_test
    scaler = scalers[method]()
    scaled_train = scaler.fit_transform(X_train)
    return scaled_train, scaler.transform(X_test)


def _report_regression(y_true, y_pred):
    """Write MAE, MSE, RMSE and R² for a regression model.

    scikit-learn metrics take ``(y_true, y_pred)``; R² in particular is not
    symmetric, so the order matters.
    """
    mse = mean_squared_error(y_true, y_pred)
    st.write(f"Mean Absolute Error (MAE): {mean_absolute_error(y_true, y_pred):.4f}")
    st.write(f"Mean Squared Error (MSE): {mse:.4f}")
    # Take the root directly rather than relying on the `squared` keyword.
    st.write(f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}")
    st.write(f"R-squared (R²): {r2_score(y_true, y_pred):.4f}")


def _report_classification(y_true, y_pred, title):
    """Write accuracy and F1 and draw the confusion matrix for a classifier."""
    st.write(f"Accuracy Score: {accuracy_score(y_true, y_pred):.4f}")
    try:
        st.write(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    except ValueError:
        # The default binary average rejects multiclass targets.
        macro_f1 = f1_score(y_true, y_pred, average="macro")
        st.write(f"Macro F1 Score: {macro_f1:.4f}")
    plot_confusion_matrix(y_true, y_pred, title)
815
+
816
+
817
def load_csv(file):
    """Parse *file* (anything ``pandas.read_csv`` accepts) into a DataFrame."""
    return pd.read_csv(file)
820
+
821
+
822
def data_overview(data):
    """Write the row count to the page and return the column-count sentence.

    The caller is expected to display the returned string itself.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]
    st.write(f"Number of Rows: {n_rows}")
    return f"Number of Columns: {n_cols}"
826
+
827
+
828
def missing_data(data):
    """Render an HTML table of per-column missing-value counts and percentages.

    Only columns that contain at least one missing value are listed.
    Returns whatever ``st.markdown`` returns.
    """
    na_counts = data.isna().sum()
    na_counts = na_counts[na_counts > 0]
    na_share = ((na_counts / data.shape[0]) * 100).round(2).astype(str) + "%"
    table = pd.DataFrame({"Missing Values": na_counts, "Percentage": na_share})
    html = table.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
840
+
841
+
842
def display_data_info(data):
    """Render an HTML table with each column's dtype and number of unique values.

    Returns whatever ``st.markdown`` returns.
    """
    dtype_table = pd.DataFrame(data.dtypes, columns=["Data Type"]).reset_index()
    dtype_table.columns = ["Column", "Data Type"]

    unique_table = pd.DataFrame(data.nunique(), columns=["Unique Counts"]).reset_index()
    unique_table.columns = ["Column", "Unique Counts"]

    summary = pd.merge(dtype_table, unique_table, on="Column")
    html = summary.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
854
+
855
+
856
def value_counts(data):
    """Show a column picker and, once a column is chosen, its value counts."""
    chosen = st.selectbox("Select a Column", [""] + list(data.columns))
    if not chosen:
        # Nothing picked yet (the blank first entry is still selected).
        return
    st.write(data[chosen].value_counts())
860
+
861
+
862
def duplicate(data):
    """Report duplicate rows and drop them from *data* in place.

    Returns the "no duplicates" sentence when the frame is clean. Otherwise
    writes the duplicate count to the page, removes the duplicates (keeping
    the first occurrence) and returns an empty string.
    """
    dup_mask = data.duplicated()
    if not dup_mask.any():
        return "There are no duplicate rows in the DataFrame."

    st.write(
        f"There is/are {dup_mask.sum()} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    data.drop_duplicates(keep="first", inplace=True)
    return ""
872
+
873
+
874
def countplot(data, col):
    """Draw a horizontal count plot of the categories in ``data[col]``.

    The chart is drawn on its own Figure, which is handed to Streamlit and
    then closed, so figures do not pile up in pyplot's global registry
    across script reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(
        y=data[col],
        palette=palette[1:],
        edgecolor="#1c1c1c",
        linewidth=2,
        ax=ax,
    )
    ax.set_title(f"Countplot of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
879
+
880
+
881
def piechart(data, col):
    """Draw a pie chart of the value frequencies in ``data[col]``.

    The chart is drawn on its own Figure, which is handed to Streamlit and
    then closed, so figures do not pile up in pyplot's global registry
    across script reruns.
    """
    value_counts = data[col].value_counts()
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(
        value_counts,
        labels=value_counts.index,
        autopct="%1.1f%%",
        colors=palette,
        shadow=False,
        wedgeprops=dict(edgecolor="#1c1c1c"),
    )
    ax.set_title(f"Pie Chart of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
894
+
895
+
896
def histogram(data, col):
    """Draw a histogram with a KDE overlay for ``data[col]``.

    The chart is drawn on its own Figure, which is handed to Streamlit and
    then closed, so figures do not pile up in pyplot's global registry
    across script reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data[col],
        kde=True,
        color=palette[4],
        fill=True,
        edgecolor="#1c1c1c",
        linewidth=2,
        ax=ax,
    )
    ax.set_title(f"Histogram of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
908
+
909
+
910
def violinplot(data, col):
    """Draw a violin plot of ``data[col]``.

    The series is passed as ``data=`` explicitly (the original passed it as
    the first positional argument). The chart is drawn on its own Figure,
    which is closed after rendering so figures do not accumulate across
    reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(data=data[col], color=palette[8], ax=ax)
    ax.set_title(f"Violin Plot of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
915
+
916
+
917
def scatterplot(data, col):
    """Draw a scatter plot of ``data[col]``.

    The series is passed as ``data=`` explicitly (the original passed it as
    the first positional argument). The chart is drawn on its own Figure,
    which is closed after rendering so figures do not accumulate across
    reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(data=data[col], color=palette[3], ax=ax)
    ax.set_title(f"Scatter Plot of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
922
+
923
+
924
def biscatterplot(data, cols):
    """Draw a scatter plot of ``cols[0]`` (x) against ``cols[1]`` (y).

    *cols* must hold at least two column names; with fewer, a hint is written
    to the page instead of a raw ``IndexError`` message. Plotting errors are
    written to the page rather than raised, and the Figure is always closed.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw a scatter plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        # No `palette` here: without a `hue` variable seaborn ignores it and
        # only emits a warning.
        sns.scatterplot(
            data=data,
            x=cols[0],
            y=cols[1],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        plt.close(fig)
939
+
940
+
941
def bibarplot(data, cols):
    """Draw a bar plot of ``cols[1]`` (y) for each value of ``cols[0]`` (x).

    *cols* must hold at least two column names; with fewer, a hint is written
    to the page instead of a raw ``IndexError`` message. Plotting errors are
    written to the page rather than raised, and the Figure is always closed.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw a bar plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.barplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        plt.close(fig)
956
+
957
+
958
def biboxplot(data, cols):
    """Draw a box plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    *cols* must hold at least two column names; with fewer, a hint is written
    to the page instead of a raw ``IndexError`` message. Plotting errors are
    written to the page rather than raised, and the Figure is always closed.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw a box plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.boxplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        plt.close(fig)
966
+
967
+
968
def paretoplot(data, categorical_col):
    """Draw a Pareto chart (frequency bars plus cumulative-percentage line).

    Categories are ordered by descending frequency and the cumulative
    percentage is computed over that same ordering. Errors are written to the
    page, as the other plotting helpers do, instead of being silently
    discarded, and the Figure is always closed.
    """
    fig, ax1 = plt.subplots(figsize=(10, 8))
    try:
        # value_counts() already sorts by descending frequency.
        value_counts = data[categorical_col].value_counts()
        cumulative_percentage = value_counts.cumsum() / value_counts.sum() * 100
        pareto_df = pd.DataFrame(
            {
                "Categories": value_counts.index,
                "Frequency": value_counts.values,
                "Cumulative Percentage": cumulative_percentage.values,
            }
        )

        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        # Second y-axis sharing the x-axis, for the percentage line.
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        plt.close(fig)
1005
+
1006
+
1007
def plot_confusion_matrix(y_true, y_pred, title):
    """Render a confusion-matrix heatmap for the given labels.

    Arguments are forwarded to ``confusion_matrix`` in the order
    ``(y_true, y_pred)``, so rows are true labels and columns are predicted
    labels, matching the axis captions. The heatmap is drawn on its own
    Figure, which is closed after rendering so figures do not accumulate
    across reruns.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title(title)
    st.pyplot(fig)
    plt.close(fig)
1015
+
1016
+
1017
# Build the page only when this file is executed as the main module, not when
# it is imported.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ scikit-learn
2
+ numpy
3
+ pandas
4
+ matplotlib
5
+ seaborn
6
+ imblearn
7
+ xgboost
8
+ lightgbm