danielperales committed on
Commit
8056499
1 Parent(s): 0836b70

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +330 -0
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ import matplotlib as mpl
7
+ import pycaret
8
+ import streamlit as st
9
+ from streamlit_option_menu import option_menu
10
+ import PIL
11
+ from PIL import Image
12
+ from PIL import ImageColor
13
+ from PIL import ImageDraw
14
+ from PIL import ImageFont
15
+
16
+ def main():
17
+ st.set_page_config(layout="wide")
18
+
19
+ hide_streamlit_style = """
20
+ <style>
21
+ #MainMenu {visibility: hidden;}
22
+ footer {visibility: hidden;}
23
+ </style>
24
+ """
25
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
26
+
27
+ with st.sidebar:
28
+ image = Image.open('itaca_logo.png')
29
+ st.image(image, width=150) #,use_column_width=True)
30
+ page = option_menu(menu_title='Menu',
31
+ menu_icon="robot",
32
+ options=["Clustering Analysis",
33
+ "Anomaly Detection"],
34
+ icons=["chat-dots",
35
+ "key"],
36
+ default_index=0
37
+ )
38
+
39
+ # Additional section below the option menu
40
+ # st.markdown("---") # Add a separator line
41
+ st.header("Settings")
42
+
43
+ num_lines = st.number_input("% of lines to be processed:", min_value=0, max_value=100, value=100)
44
+ graph_select = st.checkbox("Show Graphics", value= True)
45
+ feat_imp_select = st.checkbox("Feature Importance", value= False)
46
+
47
+ # Define the options for the dropdown list
48
+ numclusters = [2, 3, 4, 5, 6]
49
+ selected_clusters = st.slider("Choose a number of clusters", min_value=2, max_value=10, value=4)
50
+
51
+ p_remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
52
+ p_multicollinearity_threshold = st.slider("Choose multicollinearity thresholds", min_value=0.0, max_value=1.0, value=0.9)
53
+ # p_remove_outliers = st.checkbox("Remove Outliers", value=False)
54
+ # p_outliers_method = st.selectbox ("Choose an Outlier Method", ["iforest", "ee", "lof"])
55
+ p_transformation = st.checkbox("Choose Power Transform", value = False)
56
+ p_normalize = st.checkbox("Choose Normalize", value = False)
57
+ p_pca = st.checkbox("Choose PCA", value = False)
58
+ p_pca_method = st.selectbox ("Choose a PCA Method", ["linear", "kernel", "incremental"])
59
+
60
+ st.title('ITACA Insurance Core AI Module')
61
+
62
+ #col1, col2 = st.columns(2)
63
+
64
+ if page == "Clustering Analysis":
65
+ #with col1:
66
+ st.header('Clustering Analysis')
67
+
68
+ st.write(
69
+ """
70
+ """
71
+ )
72
+ # import pycaret unsupervised models
73
+ from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
74
+ # import ClusteringExperiment
75
+ from pycaret.clustering import ClusteringExperiment
76
+
77
+ # Display the list of CSV files
78
+ directory = "./"
79
+ all_files = os.listdir(directory)
80
+ # Filter files to only include CSV files
81
+ csv_files = [file for file in all_files if file.endswith(".csv")]
82
+ # Select a CSV file from the list
83
+ selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
84
+
85
+ # Upload the CSV file
86
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
87
+
88
+ # Define the unsupervised model
89
+ clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
90
+ selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
91
+
92
+ # Read and display the CSV file
93
+ if selected_csv != "None" or uploaded_file is not None:
94
+ if uploaded_file:
95
+ try:
96
+ delimiter = ','
97
+ insurance_claims = pd.read_csv (uploaded_file, sep=delimiter)
98
+ except ValueError:
99
+ delimiter = '|'
100
+ insurance_claims = pd.read_csv (uploaded_file, sep=delimiter, encoding='latin-1')
101
+ else:
102
+ insurance_claims = pd.read_csv(selected_csv)
103
+
104
+ num_rows = int(insurance_claims.shape[0]*(num_lines)/100)
105
+ insurance_claims_reduced = insurance_claims.head(num_rows)
106
+ st.write("Rows to be processed: " + str(num_rows))
107
+
108
+ all_columns = insurance_claims_reduced.columns.tolist()
109
+ selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
110
+ insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
111
+
112
+ with st.expander("Inference Description", expanded=True):
113
+ insurance_claims_reduced.describe().T
114
+
115
+ with st.expander("Head Map", expanded=True):
116
+ cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
117
+ num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
118
+
119
+ # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
120
+ # Calculate the correlation matrix
121
+ corr_matrix = insurance_claims_reduced[num_col].corr()
122
+ # Create a Matplotlib figure
123
+ fig, ax = plt.subplots(figsize=(12, 8))
124
+ # Create a heatmap using seaborn
125
+ #st.header("Heat Map")
126
+ sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
127
+ # Set the title for the heatmap
128
+ ax.set_title('Correlation Heatmap')
129
+ # Display the heatmap in Streamlit
130
+ st.pyplot(fig)
131
+
132
+ if st.button("Prediction"):
133
+ #insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
134
+
135
+ s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
136
+ # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
137
+ transformation=p_transformation,
138
+ normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
139
+ exp_clustering = ClusteringExperiment()
140
+ # init setup on exp
141
+ exp_clustering.setup(insurance_claims_reduced, session_id = 123)
142
+
143
+ with st.spinner("Analyzing..."):
144
+ #with col2:
145
+ #st.markdown("<br><br><br><br>", unsafe_allow_html=True)
146
+ # train kmeans model
147
+ cluster_model = create_model(selected_model, num_clusters = selected_clusters)
148
+
149
+ cluster_model_2 = assign_model(cluster_model)
150
+ # Calculate summary statistics for each cluster
151
+ cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
152
+ 'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
153
+ ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
154
+
155
+ with st.expander("Cluster Summary", expanded=False):
156
+ #st.header("Cluster Summary")
157
+ cluster_summary
158
+
159
+ with st.expander("Model Assign", expanded=False):
160
+ #st.header("Assign Model")
161
+ cluster_model_2
162
+
163
+ # all_metrics = get_metrics()
164
+ # all_metrics
165
+
166
+ with st.expander("Clustering Metrics", expanded=False):
167
+ #st.header("Clustering Metrics")
168
+ cluster_results = pull()
169
+ cluster_results
170
+
171
+ with st.expander("Clustering Plots", expanded=False):
172
+ if graph_select:
173
+ #st.header("Clustering Plots")
174
+ # plot pca cluster plot
175
+ plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
176
+
177
+ if selected_model != 'ap':
178
+ plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
179
+
180
+ if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
181
+ plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
182
+
183
+ if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
184
+ plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
185
+
186
+ if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
187
+ plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
188
+
189
+ if selected_model != 'ap':
190
+ plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
191
+
192
+ with st.expander("Feature Importance", expanded=False):
193
+ # Create a Classification Model to extract feature importance
194
+ if graph_select and feat_imp_select:
195
+ #st.header("Feature Importance")
196
+ from pycaret.classification import setup, create_model, get_config
197
+ s = setup(cluster_model_2, target = 'Cluster')
198
+ lr = create_model('lr')
199
+
200
+ # this is how you can recreate the table
201
+ feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
202
+ # sort by feature importance value and filter top 10
203
+ feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
204
+ # Display the filtered table in Streamlit
205
+ # st.dataframe(feat_imp)
206
+ # Display the filtered table as a bar chart in Streamlit
207
+ st.bar_chart(feat_imp.set_index('Feature'))
208
+
209
+ elif page == "Anomaly Detection":
210
+ #with col1:
211
+ st.header('Anomaly Detection')
212
+
213
+ st.write(
214
+ """
215
+ """
216
+ )
217
+
218
+ # import pycaret anomaly
219
+ from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
220
+ # import AnomalyExperiment
221
+ from pycaret.anomaly import AnomalyExperiment
222
+
223
+ # Display the list of CSV files
224
+ directory = "./"
225
+ all_files = os.listdir(directory)
226
+ # Filter files to only include CSV files
227
+ csv_files = [file for file in all_files if file.endswith(".csv")]
228
+ # Select a CSV file from the list
229
+ selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
230
+
231
+ # Upload the CSV file
232
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
233
+
234
+ # Define the unsupervised model
235
+ anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
236
+ selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
237
+
238
+ # Read and display the CSV file
239
+ if selected_csv != "None" or uploaded_file is not None:
240
+ if uploaded_file:
241
+ try:
242
+ delimiter = ','
243
+ insurance_claims = pd.read_csv (uploaded_file, sep=delimiter)
244
+ except ValueError:
245
+ delimiter = '|'
246
+ insurance_claims = pd.read_csv (uploaded_file, sep=delimiter, encoding='latin-1')
247
+ else:
248
+ insurance_claims = pd.read_csv(selected_csv)
249
+
250
+ num_rows = int(insurance_claims.shape[0]*(num_lines)/100)
251
+ insurance_claims_reduced = insurance_claims.head(num_rows)
252
+ st.write("Rows to be processed: " + str(num_rows))
253
+
254
+ all_columns = insurance_claims_reduced.columns.tolist()
255
+ selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
256
+ insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
257
+
258
+ with st.expander("Inference Description", expanded=True):
259
+ insurance_claims_reduced.describe().T
260
+
261
+ with st.expander("Head Map", expanded=True):
262
+ cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
263
+ num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
264
+
265
+ # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
266
+ # Calculate the correlation matrix
267
+ corr_matrix = insurance_claims_reduced[num_col].corr()
268
+ # Create a Matplotlib figure
269
+ fig, ax = plt.subplots(figsize=(12, 8))
270
+ # Create a heatmap using seaborn
271
+ #st.header("Heat Map")
272
+ sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
273
+ # Set the title for the heatmap
274
+ ax.set_title('Correlation Heatmap')
275
+ # Display the heatmap in Streamlit
276
+ st.pyplot(fig)
277
+
278
+ if st.button("Prediction"):
279
+
280
+ s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
281
+ # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
282
+ transformation=p_transformation,
283
+ normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
284
+
285
+ exp_anomaly = AnomalyExperiment()
286
+ # init setup on exp
287
+ exp_anomaly.setup(insurance_claims_reduced, session_id = 123)
288
+
289
+ with st.spinner("Analyzing..."):
290
+ #with col2:
291
+ #st.markdown("<br><br><br><br>", unsafe_allow_html=True)
292
+ # train model
293
+ anomaly_model = create_model(selected_model)
294
+
295
+ with st.expander("Assign Model", expanded=False):
296
+ #st.header("Assign Model")
297
+ anomaly_model_2 = assign_model(anomaly_model)
298
+ anomaly_model_2
299
+
300
+ with st.expander("Anomaly Metrics", expanded=False):
301
+ #st.header("Anomaly Metrics")
302
+ anomaly_results = pull()
303
+ anomaly_results
304
+
305
+ with st.expander("Anomaly Plots", expanded=False):
306
+ if graph_select:
307
+ # plot
308
+ #st.header("Anomaly Plots")
309
+ plot_model(anomaly_model, plot = 'tsne', display_format = 'streamlit')
310
+ plot_model(anomaly_model, plot = 'umap', display_format = 'streamlit')
311
+
312
+ with st.expander("Feature Importance", expanded=False):
313
+ if graph_select and feat_imp_select:
314
+ # Create a Classification Model to extract feature importance
315
+ #st.header("Feature Importance")
316
+ from pycaret.classification import setup, create_model, get_config
317
+ s = setup(anomaly_model_2, target = 'Anomaly')
318
+ lr = create_model('lr')
319
+ # this is how you can recreate the table
320
+ feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
321
+ # sort by feature importance value and filter top 10
322
+ feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
323
+ # Display the filtered table in Streamlit
324
+ # st.dataframe(feat_imp)
325
+ # Display the filtered table as a bar chart in Streamlit
326
+ st.bar_chart(feat_imp.set_index('Feature'))
327
+ try:
328
+ main()
329
+ except Exception as e:
330
+ st.sidebar.error(f"An error occurred: {e}")