import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib


def load_data(file_path):
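    """Read the preprocessed CSV and drop the leftover pandas index column, if present."""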
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    # Drop the leftover "Unnamed: 0" index column, but only if it is actually present
    if data.columns[0].startswith('Unnamed'):
        data = data.drop(columns=data.columns[0])
    return data


def preprocess_data(data, features_to_use, categorical_features, numerical_features):
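    """Build the shared preprocessor (scaling + one-hot encoding) and split the data into train/test sets."""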
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the datasets
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor


def train_model(X_train, y_train, preprocessor, best_params):
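    """Fit the preprocessing + XGBoost regression pipeline on the training set with the given hyperparameters."""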
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training datasets
    final_pipeline.fit(X_train, y_train)

    return final_pipeline


def evaluate_model(model, X_test, y_test):
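    """Return MAE and RMSE of the fitted pipeline on the held-out test set."""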
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse


def save_model(model, model_path):
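    """Persist the fitted pipeline to disk with joblib."""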
    joblib.dump(model, model_path)


def main():
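    """Load the data, train the XGBoost pipeline, report test-set error, and save the fitted model."""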
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
                                                                     numerical_features)

    # best_params holds the XGBoost hyperparameters selected in a separate hyperparameter tuning run.
    #
    # - 'n_estimators': number of boosting rounds (trees) to build.
    # - 'max_depth': maximum tree depth for the base learners.
    # - 'learning_rate': shrinkage applied to each tree's contribution. Smaller values require more
    #       boosting rounds but can improve generalization; typical values range from 0.01 to 0.1.
    # - 'subsample': fraction of the training observations sampled for each tree. Smaller values yield
    #       less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }
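
    # Illustrative sketch (not part of the original tuning run): hyperparameters like the ones
    # above could be found with scikit-learn's RandomizedSearchCV over the same pipeline. The
    # search space below is an assumption for demonstration, not the grid actually used here.
    #
    #   from sklearn.model_selection import RandomizedSearchCV
    #
    #   search_pipeline = Pipeline(steps=[
    #       ('preprocessor', preprocessor),
    #       ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
    #   ])
    #   param_distributions = {
    #       'model__n_estimators': [200, 400, 800],
    #       'model__max_depth': [6, 10, 20],
    #       'model__learning_rate': [0.01, 0.05, 0.08, 0.1],
    #       'model__subsample': [0.7, 0.8, 0.9, 1.0],
    #   }
    #   search = RandomizedSearchCV(search_pipeline, param_distributions, n_iter=20,
    #                               scoring='neg_root_mean_squared_error', cv=3, random_state=42)
    #   search.fit(X_train, y_train)
    #   best_params = {k.replace('model__', ''): v for k, v in search.best_params_.items()}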

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')
    # Save the final model
    save_model(model, model_file_path)


if __name__ == "__main__":
    main()