import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib
def load_data(file_path):
    """Load the preprocessed dataset and drop the leftover index column."""
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    data = data.drop(data.columns[0], axis=1)  # Drop the 'Unnamed: 0' column
    return data
def preprocess_data(data, features_to_use, categorical_features, numerical_features):
    """Build the preprocessing ColumnTransformer and split the data into train and test sets."""
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the dataset
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test, preprocessor
def train_model(X_train, y_train, preprocessor, best_params):
    """Fit an XGBoost regression pipeline on the training data with the given hyperparameters."""
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training set
    final_pipeline.fit(X_train, y_train)
    return final_pipeline
def evaluate_model(model, X_test, y_test):
    """Return MAE and RMSE of the fitted model on the held-out test set."""
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse
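
# The following helper is an illustrative sketch of how the best_params used in main() could be
# obtained via cross-validated grid search. It is not called by main(); the parameter grid, cv
# value, and scoring choice are assumptions shown for illustration, not the settings actually
# used to derive the values in main().
def tune_hyperparameters(X_train, y_train, preprocessor):
    # Imported here so this optional sketch stays self-contained.
    from sklearn.model_selection import GridSearchCV

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
    ])
    # Step-prefixed names ('model__...') address the XGBRegressor inside the pipeline.
    param_grid = {
        'model__n_estimators': [200, 400, 600],
        'model__max_depth': [10, 20, 30],
        'model__learning_rate': [0.01, 0.05, 0.08, 0.1],
        'model__subsample': [0.8, 0.9, 1.0],
    }
    search = GridSearchCV(pipeline, param_grid, cv=5,
                          scoring='neg_mean_absolute_error', n_jobs=-1)
    search.fit(X_train, y_train)
    # Strip the 'model__' prefix so the result matches the best_params format expected by train_model
    return {name.replace('model__', ''): value for name, value in search.best_params_.items()}
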
def save_model(model, model_path):
    """Persist the fitted pipeline (preprocessor + model) to disk."""
    joblib.dump(model, model_path)
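
# Illustrative counterpart to save_model, showing how the saved pipeline would be reloaded and
# used for prediction. It is not called by main(); the usage example below is a placeholder.
def load_saved_model(model_path):
    return joblib.load(model_path)

# Example usage (assumed path and placeholder feature values):
#   model = load_saved_model('../../saved_models/xgboost_model.joblib')
#   new_flights = pd.DataFrame([...])  # rows with the same columns as features_to_use
#   predictions = model.predict(new_flights)
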
def main():
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load the data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use,
                                                                     categorical_features,
                                                                     numerical_features)

    # best_params holds the optimal hyperparameters for the XGBoost model, determined through
    # hyperparameter tuning (see the illustrative tune_hyperparameters sketch above).
    #
    # - 'n_estimators': the number of boosting rounds (trees) to build.
    # - 'max_depth': maximum tree depth for base learners.
    # - 'learning_rate': shrinkage applied to each tree's contribution. Smaller values require more
    #   boosting rounds but can improve generalization; typical values range from 0.01 to 0.1.
    # - 'subsample': fraction of observations sampled for each tree. Smaller values yield smaller,
    #   less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    # Save the final model
    save_model(model, model_file_path)
if __name__ == "__main__":
    main()