# Imports
import pickle
import os
from subprocess import call

import pandas as pd
from sklearn import datasets
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
# PATHS
DIRPATH = os.path.dirname(os.path.realpath(__file__))
ml_fp = os.path.join(DIRPATH, "assets", "ml", "ml_components.pkl")
req_fp = os.path.join(DIRPATH, "assets", "ml", "requirements.txt")
eda_report_fp = os.path.join(DIRPATH, "assets", "ml", "eda-report.html")
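# Defensive addition (not in the original script): create the output folder
# so the report, model, and requirements writes below don't fail on a
# missing directory.
os.makedirs(os.path.join(DIRPATH, "assets", "ml"), exist_ok=True)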
# Import some data to play with
iris = datasets.load_iris(return_X_y=False, as_frame=True)
df = iris['frame']
target_col = 'target'

# Pandas profiling (EDA report)
profile = ProfileReport(df, title="Dataset", html={
    'style': {'full_width': True}})
profile.to_file(eda_report_fp)
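# Note (an aside, not in the original): profiling can be slow on large
# datasets; ydata_profiling also supports a lighter pass, e.g.
# ProfileReport(df, title="Dataset", minimal=True).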
# Dataset Splitting
# Please specify the columns to exclude from the feature set
to_ignore_cols = [
    "ID",  # ID
    "Id", "id",
    target_col
]

num_cols = list(set(df.select_dtypes('number').columns) - set(to_ignore_cols))
cat_cols = list(set(df.select_dtypes(exclude='number').columns) - set(to_ignore_cols))
print(f"\n[Info] The {len(num_cols)} numeric columns are : {num_cols}\nThe {len(cat_cols)} categorical columns are : {cat_cols}")

X = df.drop(columns=[col for col in to_ignore_cols if col in df.columns])
y = df[target_col].values
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)
print(
    f"\n[Info] Dataset split : (X_train, y_train) = {(X_train.shape, y_train.shape)}, (X_eval, y_eval) = {(X_eval.shape, y_eval.shape)}.\n")
# Modeling

# Imputers
num_imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")
cat_imputer = SimpleImputer(
    strategy="most_frequent").set_output(transform="pandas")

# Scaler & Encoder
if len(cat_cols) > 0:
    # Stack the dataframe on itself so the imputer sees every category,
    # then recover the full category lists for the encoder.
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
    df_imputed_stacked_cat = cat_imputer.fit_transform(
        pd.concat([df, df, df])[cat_cols])
    cat_ = OneHotEncoder(sparse_output=False, drop="first").fit(
        df_imputed_stacked_cat).categories_
else:
    cat_ = 'auto'

# sparse_output replaces the deprecated `sparse` argument (scikit-learn >= 1.2)
encoder = OneHotEncoder(categories=cat_, sparse_output=False, drop="first")
scaler = StandardScaler().set_output(transform="pandas")
# Feature pipelines
num_pipe = Pipeline(steps=[("num_imputer", num_imputer), ("scaler", scaler)])
cat_pipe = Pipeline(steps=[("cat_imputer", cat_imputer), ("encoder", encoder)])

# End-to-end features preprocessor
transformers = []
if len(num_cols) > 0:
    transformers.append(("numerical", num_pipe, num_cols))
if len(cat_cols) > 0:
    transformers.append(("categorical", cat_pipe, cat_cols))
# ("date", date_pipe, date_cols),
preprocessor = ColumnTransformer(
    transformers=transformers).set_output(transform="pandas")
print(
    f"\n[Info] Features Transformer : {transformers}.\n")
# End-to-end pipeline
end2end_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=10))
]).set_output(transform="pandas")
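# Optional sanity check (an addition, not part of the original flow): 5-fold
# cross-validation on the training split before committing to the final fit.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(end2end_pipeline, X_train, y_train, cv=5)
print(f"\n[Info] CV accuracy : {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}\n")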
# Training
print(
    f"\n[Info] Training.\n[Info] X_train : columns({X_train.columns.tolist()}), shape: {X_train.shape}.\n")
end2end_pipeline.fit(X_train, y_train)
# Evaluation
print(
    f"\n[Info] Evaluation.\n")
y_eval_pred = end2end_pipeline.predict(X_eval)
print(classification_report(y_eval, y_eval_pred,
                            target_names=iris['target_names']))
# ConfusionMatrixDisplay.from_predictions(
#     y_eval, y_eval_pred, display_labels=iris['target_names'])
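# Sketch for persisting the confusion matrix instead of displaying it
# (matplotlib is assumed available, as scikit-learn already depends on it;
# the `cm_fp` filename is an assumption, not part of the original script).
import matplotlib
matplotlib.use("Agg")  # headless backend: write to file, no display needed
import matplotlib.pyplot as plt
cm_fp = os.path.join(DIRPATH, "assets", "ml", "confusion-matrix.png")
disp = ConfusionMatrixDisplay.from_predictions(
    y_eval, y_eval_pred, display_labels=iris['target_names'])
disp.figure_.savefig(cm_fp, bbox_inches="tight")
plt.close(disp.figure_)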
# Exportation
print(
    f"\n[Info] Exportation.\n")
to_export = {
    "labels": iris['target_names'],
    "pipeline": end2end_pipeline,
}

# Save components to file
with open(ml_fp, 'wb') as file:
    pickle.dump(to_export, file)
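# Round-trip sketch (an addition): reload the pickled components and confirm
# the pipeline still predicts on a few held-out rows.
with open(ml_fp, 'rb') as file:
    reloaded = pickle.load(file)
print(f"\n[Info] Reloaded pipeline sanity check : {reloaded['pipeline'].predict(X_eval.head())}\n")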
# Requirements
# ! pip freeze > requirements.txt
call(f"pip freeze > {req_fp}", shell=True)