Spaces:
Runtime error
Runtime error
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
def load_data(file_path):
    """Load the preprocessed dataset from a CSV file.

    A CSV written together with its DataFrame index is read back by pandas
    with a leading column labelled ``Unnamed: 0``.  That artifact column is
    dropped when present.  A file that was saved without the index is
    returned with all of its columns, instead of silently losing its first
    real column.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame with a fresh ``0..n-1`` index and no leftover
        index column.
    """
    data = pd.read_csv(file_path).reset_index(drop=True)
    # Only remove the serialized-index artifact, never a named data column.
    if len(data.columns) > 0 and str(data.columns[0]).startswith('Unnamed:'):
        data = data.drop(columns=data.columns[0])
    return data
def preprocess_data(data, features_to_use, categorical_features, numerical_features,
                    *, target_column='fuel_burn_total', test_size=0.2, random_state=42):
    """Build the feature preprocessor and split the data into train/test sets.

    Args:
        data: DataFrame containing the feature columns and the target column.
        features_to_use: Column names selected as model inputs.
        categorical_features: Columns routed to the one-hot encoder.
        numerical_features: Columns routed to the standard scaler.
        target_column: Name of the column to predict.  Defaults to
            ``'fuel_burn_total'``, the value that used to be hard-coded.
        test_size: Value forwarded to ``train_test_split`` (default ``0.2``).
        random_state: Seed forwarded to ``train_test_split`` (default ``42``).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, preprocessor)``.  The
        ``ColumnTransformer`` is returned unfitted; nothing in this function
        calls ``fit`` on it.

    Raises:
        KeyError: If a requested feature or the target column is missing
            from ``data`` (raised by the pandas column lookup).
    """
    # Each transformer is wrapped in a single-step Pipeline so the step names
    # ('scaler' / 'encoder') stay addressable on the resulting object.
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        # Categories unseen during fit are encoded as all zeros, not an error.
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route numerical and categorical columns to their own transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
        ])

    X = data[features_to_use]
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test, preprocessor
def train_model(X_train, y_train, preprocessor, best_params):
    """Fit a preprocessing + XGBoost regression pipeline on the training data.

    Args:
        X_train: Training features.
        y_train: Training target values.
        preprocessor: Transformer used as the first pipeline step.
        best_params: Mapping providing ``'n_estimators'``, ``'max_depth'``,
            ``'learning_rate'`` and ``'subsample'``.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'model'``.

    Raises:
        KeyError: If ``best_params`` lacks one of the four keys above.
    """
    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        subsample=best_params['subsample'],
        random_state=42,
    )
    pipeline_steps = [
        ('preprocessor', preprocessor),
        ('model', regressor),
    ]
    fitted_pipeline = Pipeline(steps=pipeline_steps)
    # Fitting the pipeline fits the preprocessor and the regressor in sequence.
    fitted_pipeline.fit(X_train, y_train)
    return fitted_pipeline
def evaluate_model(model, X_test, y_test):
    """Score ``model`` on held-out data.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Features to predict on.
        y_test: True target values for ``X_test``.

    Returns:
        Tuple ``(mae, rmse)``: the mean absolute error and the square root of
        the mean squared error of the predictions.
    """
    predictions = model.predict(X_test)
    abs_error = mean_absolute_error(y_test, predictions)
    # RMSE is derived from the MSE so it is in the same units as the target.
    root_sq_error = np.sqrt(mean_squared_error(y_test, predictions))
    return abs_error, root_sq_error
def save_model(model, model_path):
    """Serialize ``model`` to ``model_path`` with ``joblib.dump``.

    When ``model_path`` is a filesystem path, its parent directory is created
    first, so saving into a directory that does not exist yet no longer fails
    with ``FileNotFoundError``.

    Args:
        model: Object to persist.
        model_path: Destination path (``str`` or ``pathlib.Path``), or any
            other target accepted by ``joblib.dump``, which is passed through
            unchanged.
    """
    # Only real paths have a parent directory to create; anything else
    # (e.g. an already-open file object) is handed to joblib as-is.
    if isinstance(model_path, (str, Path)):
        Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_path)
def main():
    """Run the full workflow: load, split, train, evaluate, and save the model."""
    # Both locations are relative to the current working directory.
    dataset_csv = '../../datasets/preprocessed_data.csv'
    model_output = '../../saved_models/xgboost_model.joblib'

    # Model inputs; the order here is the column order of the feature frame.
    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist',
    ]
    # Columns treated as categories; every other selected feature is numeric.
    categorical_features = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'J/T', 'CAT',
    ]
    numerical_features = [
        name for name in features_to_use if name not in categorical_features
    ]

    # Hyperparameters for the XGBoost regressor, obtained from an earlier
    # tuning run that is not part of this script:
    #   n_estimators  - number of boosting rounds (trees) to build.
    #   max_depth     - maximum tree depth for the base learners.
    #   learning_rate - shrinkage applied to each tree's contribution; smaller
    #                   values need more rounds but can generalize better.
    #   subsample     - fraction of observations sampled for each tree; lower
    #                   values can help against overfitting.
    best_params = dict(
        n_estimators=400,
        max_depth=20,
        learning_rate=0.08,
        subsample=0.9,
    )

    data = load_data(dataset_csv)

    split = preprocess_data(
        data, features_to_use, categorical_features, numerical_features)
    X_train, X_test, y_train, y_test, preprocessor = split

    model = train_model(X_train, y_train, preprocessor, best_params)

    # Report hold-out error before persisting the fitted pipeline.
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    save_model(model, model_output)


# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()