|
|
|
|
|
from sklearn.ensemble import GradientBoostingClassifier |
|
|
|
|
|
from sklearn.model_selection import train_test_split |
|
from sklearn.metrics import classification_report |
|
from sklearn import metrics |
|
|
|
import pandas as pd, numpy as np |
|
import os |
|
import yaml |
|
import sys |
|
|
|
|
|
def transform_df(df):
    """Replace text and categorical columns of *df* with integer category codes.

    Columns with object dtype or a pandas string dtype are first converted to
    the ``category`` dtype; every categorical column (pre-existing or freshly
    converted) is then replaced by its ``.cat.codes``. Other columns are left
    untouched.

    The frame is modified in place and the same object is returned, so both
    ``transform_df(df)`` and ``df = transform_df(df)`` work.

    Note: codes are derived from the categories present in *this* frame only
    (missing values become -1), so two frames encoded separately are not
    guaranteed to share the same value-to-code mapping.
    """
    for col in df.columns:
        series = df[col]

        # Text columns: object dtype (classic pandas) or a dedicated string
        # dtype (the default for strings in newer pandas versions).
        if pd.api.types.is_object_dtype(series.dtype) or isinstance(
            series.dtype, pd.StringDtype
        ):
            series = series.astype('category')

        # isinstance check replaces the deprecated
        # pd.api.types.is_categorical_dtype().
        if isinstance(series.dtype, pd.CategoricalDtype):
            df[col] = series.cat.codes

    return df
|
|
|
# ---------------------------------------------------------------------------
# Script body: fit a gradient-boosting classifier on the training CSV and
# print the ROC AUC obtained on the test CSV.
#
# Usage: <script> <train.csv> <test.csv>
# Both files are read with their first column as the index and must contain
# a 'Class' column holding the target label.
# ---------------------------------------------------------------------------
if len(sys.argv) < 3:
    # Fail with a usage hint instead of a bare IndexError.
    sys.exit("usage: {} <train.csv> <test.csv>".format(sys.argv[0]))

# NOTE(review): transform_df() encodes each frame independently, so the same
# string value may receive different integer codes in train and test when the
# two files do not contain the same set of values — confirm this is acceptable.
X_train = pd.read_csv(sys.argv[1], index_col=0)
X_train = transform_df(X_train)

y_train = X_train['Class'].astype(int)
X_train.drop(columns=['Class'], inplace=True)

X_test = pd.read_csv(sys.argv[2], index_col=0)
X_test = transform_df(X_test)

y_test = X_test['Class'].astype(int)
X_test.drop(columns=['Class'], inplace=True)

# Remove features the model was never trained on.
columns_to_drop = [col for col in X_test.columns if col not in X_train.columns]
X_test.drop(columns=columns_to_drop, inplace=True)

# The model also needs every training feature to be present at predict time;
# report the problem explicitly rather than failing inside scikit-learn.
missing_columns = [col for col in X_train.columns if col not in X_test.columns]
if missing_columns:
    raise ValueError(
        "test data is missing columns present in training data: "
        "{}".format(missing_columns)
    )

# Present the features in the same order as during fitting.
X_test = X_test[X_train.columns]

gbm_classifier = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# predict_proba() columns follow gbm_classifier.classes_; look up the column
# for label 1 instead of assuming it is always the second one.
positive_index = list(gbm_classifier.classes_).index(1)

fpr, tpr, thresholds = metrics.roc_curve(
    y_test.astype(float),
    gbm_classifier.predict_proba(X_test)[:, positive_index],
    pos_label=1,
)
print("AUC={:.9f}".format(metrics.auc(fpr, tpr)))
|
|
|
|
|
|