import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib
def load_data(file_path):
    """Load the preprocessed dataset and drop the leftover index column."""
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    data = data.drop(data.columns[0], axis=1)  # Drop the 'Unnamed: 0' column
    return data
def preprocess_data(data, features_to_use, categorical_features, numerical_features):
    """Build the preprocessing ColumnTransformer and split the data into train and test sets."""
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the dataset
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test, preprocessor
def train_model(X_train, y_train, preprocessor, best_params):
    """Fit an XGBoost regression pipeline on the training data with the given hyperparameters."""
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training set
    final_pipeline.fit(X_train, y_train)
    return final_pipeline
def evaluate_model(model, X_test, y_test):
    """Return MAE and RMSE of the fitted model on the held-out test set."""
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse
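
# The following helper is an illustrative sketch of how the best_params used in main() could be
# obtained via cross-validated grid search. It is not called by main(); the parameter grid, cv
# value, and scoring choice are assumptions shown for illustration, not the settings actually
# used to derive the values in main().
def tune_hyperparameters(X_train, y_train, preprocessor):
    # Imported here so this optional sketch stays self-contained.
    from sklearn.model_selection import GridSearchCV

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
    ])
    # Step-prefixed names ('model__...') address the XGBRegressor inside the pipeline.
    param_grid = {
        'model__n_estimators': [200, 400, 600],
        'model__max_depth': [10, 20, 30],
        'model__learning_rate': [0.01, 0.05, 0.08, 0.1],
        'model__subsample': [0.8, 0.9, 1.0],
    }
    search = GridSearchCV(pipeline, param_grid, cv=5,
                          scoring='neg_mean_absolute_error', n_jobs=-1)
    search.fit(X_train, y_train)
    # Strip the 'model__' prefix so the result matches the best_params format expected by train_model
    return {name.replace('model__', ''): value for name, value in search.best_params_.items()}
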
def save_model(model, model_path):
    """Persist the fitted pipeline (preprocessor + model) to disk."""
    joblib.dump(model, model_path)
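
# Illustrative counterpart to save_model, showing how the saved pipeline would be reloaded and
# used for prediction. It is not called by main(); the usage example below is a placeholder.
def load_saved_model(model_path):
    return joblib.load(model_path)

# Example usage (assumed path and placeholder feature values):
#   model = load_saved_model('../../saved_models/xgboost_model.joblib')
#   new_flights = pd.DataFrame([...])  # rows with the same columns as features_to_use
#   predictions = model.predict(new_flights)
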
def main():
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load the data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use,
                                                                     categorical_features,
                                                                     numerical_features)

    # best_params holds the optimal hyperparameters for the XGBoost model, determined through
    # hyperparameter tuning (see the illustrative tune_hyperparameters sketch above).
    #
    # - 'n_estimators': the number of boosting rounds (trees) to build.
    # - 'max_depth': maximum tree depth for base learners.
    # - 'learning_rate': shrinkage applied to each tree's contribution. Smaller values require more
    #   boosting rounds but can improve generalization; typical values range from 0.01 to 0.1.
    # - 'subsample': fraction of observations sampled for each tree. Smaller values yield smaller,
    #   less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    # Save the final model
    save_model(model, model_file_path)
if __name__ == "__main__":
    main()