import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib


def load_data(file_path):
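    """Read the preprocessed CSV and drop the leftover pandas index column, if present."""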
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    # Drop the leftover "Unnamed: 0" index column, but only if it is actually present
    if data.columns[0].startswith('Unnamed'):
        data = data.drop(columns=data.columns[0])
    return data


def preprocess_data(data, features_to_use, categorical_features, numerical_features):
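    """Build the shared preprocessor (scaling + one-hot encoding) and split the data into train/test sets."""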
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the datasets
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor


def train_model(X_train, y_train, preprocessor, best_params):
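    """Fit the preprocessing + XGBoost regression pipeline on the training set with the given hyperparameters."""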
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training datasets
    final_pipeline.fit(X_train, y_train)

    return final_pipeline


def evaluate_model(model, X_test, y_test):
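    """Return MAE and RMSE of the fitted pipeline on the held-out test set."""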
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse


def save_model(model, model_path):
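    """Persist the fitted pipeline to disk with joblib."""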
    joblib.dump(model, model_path)


def main():
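    """Load the data, train the XGBoost pipeline, report test-set error, and save the fitted model."""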
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
                                                                     numerical_features)

    # best_params holds the XGBoost hyperparameters selected in a separate hyperparameter tuning run.
    #
    # - 'n_estimators': number of boosting rounds (trees) to build.
    # - 'max_depth': maximum tree depth for the base learners.
    # - 'learning_rate': shrinkage applied to each tree's contribution. Smaller values require more
    #       boosting rounds but can improve generalization; typical values range from 0.01 to 0.1.
    # - 'subsample': fraction of the training observations sampled for each tree. Smaller values yield
    #       less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }
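
    # Illustrative sketch (not part of the original tuning run): hyperparameters like the ones
    # above could be found with scikit-learn's RandomizedSearchCV over the same pipeline. The
    # search space below is an assumption for demonstration, not the grid actually used here.
    #
    #   from sklearn.model_selection import RandomizedSearchCV
    #
    #   search_pipeline = Pipeline(steps=[
    #       ('preprocessor', preprocessor),
    #       ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
    #   ])
    #   param_distributions = {
    #       'model__n_estimators': [200, 400, 800],
    #       'model__max_depth': [6, 10, 20],
    #       'model__learning_rate': [0.01, 0.05, 0.08, 0.1],
    #       'model__subsample': [0.7, 0.8, 0.9, 1.0],
    #   }
    #   search = RandomizedSearchCV(search_pipeline, param_distributions, n_iter=20,
    #                               scoring='neg_root_mean_squared_error', cv=3, random_state=42)
    #   search.fit(X_train, y_train)
    #   best_params = {k.replace('model__', ''): v for k, v in search.best_params_.items()}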

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')
    # Save the final model
    save_model(model, model_file_path)


if __name__ == "__main__":
    main()