matsammut committed on
Commit
a963fb1
·
verified ·
1 Parent(s): 800145c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -40
app.py CHANGED
@@ -42,11 +42,6 @@ def predict_rf(age, workclass, education, occupation, race, gender, capital_ga
42
  return "Income >50K" if prediction == 1 else "Income <=50K"
43
 
44
  def predict_hb(age, workclass, education, occupation, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
45
- # columns = {
46
- # "age": [age], "workclass":[workclass], "educational-num":[education], "marital-status":[marital_status], "occupation":[occupation],
47
- # "relationship":[relationship], "race":[race], "gender":[gender], "capital-gain":[capital_gain], "capital-loss":[capital_loss],
48
- # "hours-per-week":[hours_per_week], "native-country":[native_country]}
49
-
50
 
51
  columns = {
52
  "age": [age], "workclass":[workclass], "educational-num":[education], "occupation":[occupation],
@@ -55,23 +50,23 @@ def predict_hb(age, workclass, education, occupation, race, gender, capital_ga
55
  df = pd.DataFrame(data=columns)
56
  fixed_features = cleaning_features(df,race,True)
57
  print(fixed_features)
58
- hdb_model = pickle.load(open('hdbscan_model.pkl', 'rb'))
59
- prediction = hdb_model.approximate_predict(fixed_features)
60
- # scaler = StandardScaler()
61
- # X = scaler.fit_transform(fixed_features)
62
 
63
- # clusterer = hdbscan.HDBSCAN(
64
- # min_cluster_size=220,
65
- # min_samples=117,
66
- # metric='euclidean',
67
- # cluster_selection_method='eom',
68
- # prediction_data=True,
69
- # cluster_selection_epsilon=0.28479667859306007
70
- # )
71
 
72
- # prediction = clusterer.fit_predict(X)
73
- # filename = 'hdbscan_model.pkl'
74
- # pickle.dump(clusterer, open(filename, 'wb'))
75
 
76
  return f"Predicted Cluster (HDBSCAN): {prediction[-1]}"
77
 
@@ -127,19 +122,14 @@ def cleaning_features(data,race,hdbscan):
127
  data[f'race_{races}'] = 1
128
  else:
129
  data[f'race_{races}'] = 0
130
- # for N in columns_to_encode:
131
- # race_encoded = encoder.transform(data[[N]])
132
- # race_encoded_cols = encoder.get_feature_names_out([N])
133
- # race_encoded_df = pd.DataFrame(race_encoded, columns=race_encoded_cols, index=data.index)
134
- # # Combine the encoded data with original dataframe
135
- # data = pd.concat([data.drop(N, axis=1), race_encoded_df], axis=1)
136
  data = data.drop(columns=['race'])
137
 
138
  data = pca(data)
139
  if(hdbscan):
140
- # df_transformed = pd.read_csv('dataset.csv')
141
- # X = df_transformed.drop('income', axis=1)
142
- # data = pd.concat([X, data], ignore_index=True)
143
  data['capital-gain'] = np.log1p(data['capital-gain'])
144
  data['capital-loss'] = np.log1p(data['capital-loss'])
145
  scaler = joblib.load("robust_scaler.pkl")
@@ -148,17 +138,6 @@ def cleaning_features(data,race,hdbscan):
148
 
149
  return data
150
 
151
- # def pca(data):
152
- # encoder = OneHotEncoder(sparse_output=False)
153
- # one_hot_encoded = encoder.fit_transform(data[['workclass', 'occupation']])
154
- # encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
155
- # pca_net = PCA(n_components=10)
156
- # pca_result_net = pca_net.fit_transform(encoded_columns_df)
157
- # pca_columns = [f'pca_component_{i+1}' for i in range(10)]
158
- # pca_df = pd.DataFrame(pca_result_net, columns=pca_columns)
159
- # data = data.drop(columns=['workclass', 'occupation'], axis=1) #remove the original columns
160
- # data = pd.concat([data, pca_df], axis=1)
161
- # return data
162
 
163
 
164
  def pca(data):
 
42
  return "Income >50K" if prediction == 1 else "Income <=50K"
43
 
44
  def predict_hb(age, workclass, education, occupation, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
 
 
 
 
 
45
 
46
  columns = {
47
  "age": [age], "workclass":[workclass], "educational-num":[education], "occupation":[occupation],
 
50
  df = pd.DataFrame(data=columns)
51
  fixed_features = cleaning_features(df,race,True)
52
  print(fixed_features)
53
+ # hdb_model = pickle.load(open('hdbscan_model.pkl', 'rb'))
54
+ # prediction = hdb_model.approximate_predict(fixed_features)
55
+ scaler = StandardScaler()
56
+ X = scaler.fit_transform(fixed_features)
57
 
58
+ clusterer = hdbscan.HDBSCAN(
59
+ min_cluster_size=220,
60
+ min_samples=117,
61
+ metric='euclidean',
62
+ cluster_selection_method='eom',
63
+ prediction_data=True,
64
+ cluster_selection_epsilon=0.28479667859306007
65
+ )
66
 
67
+ prediction = clusterer.fit_predict(X)
68
+ filename = 'hdbscan_model.pkl'
69
+ pickle.dump(clusterer, open(filename, 'wb'))
70
 
71
  return f"Predicted Cluster (HDBSCAN): {prediction[-1]}"
72
 
 
122
  data[f'race_{races}'] = 1
123
  else:
124
  data[f'race_{races}'] = 0
125
+
 
 
 
 
 
126
  data = data.drop(columns=['race'])
127
 
128
  data = pca(data)
129
  if(hdbscan):
130
+ df_transformed = pd.read_csv('dataset.csv')
131
+ X = df_transformed.drop('income', axis=1)
132
+ data = pd.concat([X, data], ignore_index=True)
133
  data['capital-gain'] = np.log1p(data['capital-gain'])
134
  data['capital-loss'] = np.log1p(data['capital-loss'])
135
  scaler = joblib.load("robust_scaler.pkl")
 
138
 
139
  return data
140
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
 
143
  def pca(data):