Spaces:
Runtime error
Delete xgboost
- xgboost/gradient_boosting_regressor.py +0 -240
- xgboost/inference.py +0 -58
- xgboost/model.py +0 -125
xgboost/gradient_boosting_regressor.py
DELETED
@@ -1,240 +0,0 @@
# # -*- coding: utf-8 -*-
# """gradient_boosting_regressor.ipynb

# Automatically generated by Colab.

# Original file is located at
#     https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
# """

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from scipy.stats import f  # needed for the p-value calculation below

# # the dataset I am using is from RapidAPI
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers=headers)
# if response.status_code == 200:
#     data = response.json()
#     print(data)
# else:
#     print(f"Error {response.status_code}: {response.text}")
# # Note: climbData and descendData are not used since there is only one key entry for both features

# # Gradient Boosting Regressor
# # Here I'm using the same JSON dataset with a new model, Gradient Boosting Regressor

# data = response.json()
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)  # extracting features for the model

# # print(df.columns)

# # convert to numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

# df.dropna(inplace=True)
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]  # you can play with this and add more features; I kept it simple with what I know is important
# target = df['fuel']

# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)  # split into train and test

# model = GradientBoostingRegressor(n_estimators=100, learning_rate=25, max_depth=5, random_state=42)  # can play with the hyperparameters and observe model metrics
# model.fit(features_train, target_train)  # fitting the model
# target_prediction = model.predict(features_test)  # predictions

# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()


# feature_we_want = len(target)  # number of samples
# regression = 1  # regression degrees of freedom (assumes a single predictor)
# residual = feature_we_want - 2
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
# unexplained_variance = mse * feature_we_want

# F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F-statistic for the report
# p_value = 1 - f.cdf(F_value, regression, residual)
# rse = np.sqrt(mse)

# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]  # one value per feature in ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']; you can change these
# predicted_fuel_future = model.predict([future_distance_nm])  # prediction for the new row
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # can change to "dist_km" to see the average in km

# print(f"mean squared error: {mse}")  # checking the model performance
# print(f"R-squared: {r2}")
# print(f"mean absolute error: {mae}")
# print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the gradient boosting model")
# print(f"regression: {regression:.4f}")
# print(f"residual: {residual:.4f}")
# print(f"p-value: {p_value:.4f}")  # calculating the p-value for the report
# print(f"standard error: {rse:.2f}")
# print(f"f-statistic: {F_value:.2f}")
# print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0]:.2f} kg")

# # the MSE seems very high, but this changes if we add or remove features
# # the R-squared and MAE match the linear regression model, so that's good

# # added more features; now playing with the hyperparameters, the metrics go up and down depending on them

# # MSE really high, this is a bad model; R-squared is a negative number

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.preprocessing import StandardScaler

# # Load data from API
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers=headers)

# if response.status_code == 200:
#     data = response.json()
# else:
#     print(f"Error {response.status_code}: {response.text}")

# # Extract features
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)

# # Convert relevant columns to numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

# df.dropna(inplace=True)

# # Define features and target
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
# target = df['fuel']

# # Split the data
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Gradient Boosting Regressor
# model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
# model.fit(features_train, target_train)
# target_prediction = model.predict(features_test)

# # Evaluate model performance
# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# # Future predictions
# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
# predicted_fuel_future = model.predict([future_distance_nm])

# # Print the results
# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")
# print(f"Mean Absolute Error: {mae}")
# print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
# print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")

# # Comment on performance
# if mse > 1000:  # Threshold can be adjusted
#     print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
# if r2 < 0:
#     print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")

import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import f  # Importing the F-distribution

# Load data from API
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    data = response.json()
else:
    raise SystemExit(f"Error {response.status_code}: {response.text}")  # stop here: the rest of the script needs `data`

# Extract features
features = [feature['properties'] for feature in data['features']]
df = pd.DataFrame(features)

# Convert relevant columns to numeric
df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

df.dropna(inplace=True)

# Define features and target
# NOTE: 'fuel' is both a feature and the target here, so the model can read the answer off its input (target leakage)
features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
target = df['fuel']

# Split the data
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(features_train, target_train)
target_prediction = model.predict(features_test)

# Evaluate model performance
mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# Future predictions (one row, same column order as `features`)
future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
predicted_fuel_future = model.predict(pd.DataFrame([future_distance_nm], columns=features.columns))

# Calculate F-statistic and p-value (if necessary)
n_samples = len(target)
n_predictors = features_train.shape[1]
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)

# Print the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")
print(f"p-value: {p_value:.4f}")
print(f"F-statistic: {F_value:.2f}")

# Comment on performance
if mse > 1000:  # Threshold can be adjusted
    print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
if r2 < 0:
    print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
xgboost/inference.py
DELETED
@@ -1,58 +0,0 @@
import pandas as pd
import joblib


def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df.drop(df.columns[0], axis=1)  # Drop the 'Unnamed: 0' (index) column
    return df


def load_model(model_path):
    return joblib.load(model_path)


def evaluate_model(df, model, selected_features, batch_size=100):
    total_accuracy = 0
    num_rows = len(df)

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        batch_df = df.iloc[start:end].copy()  # copy so pop() doesn't touch the original frame

        fuel_burn_total = batch_df.pop('fuel_burn_total').values
        batch_df = batch_df[selected_features]

        predictions = model.predict(batch_df)

        # Accuracy for the current batch: 1 minus the mean relative error
        accuracy = 1 - abs(fuel_burn_total - predictions) / fuel_burn_total
        batch_accuracy = accuracy.mean()
        total_accuracy += batch_accuracy * len(batch_df)

        print(f'Processed rows {start + 1} to {end} out of {num_rows} rows')

    average_accuracy = total_accuracy / num_rows
    return average_accuracy


def main():
    data_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/test.csv'
    model_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/xgboost_model.joblib'

    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                         'seats', 'distance', 'J/T', 'CAT', 'dist']

    # Load data and model
    df = load_data(data_file_path)
    model = load_model(model_file_path)

    # Evaluate the model
    average_accuracy = evaluate_model(df, model, selected_features)

    # Print the average accuracy
    print(f'Average Accuracy: {average_accuracy:.2%}')


if __name__ == "__main__":
    main()
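The per-batch accuracy above is one minus the mean absolute relative error, i.e. 1 - MAPE, and because each batch is weighted by its row count, the final average equals the same metric computed over all rows at once. A cross-check sketch, assuming the same test.csv layout and that all rows fit in memory (the paths here are placeholders, not the originals):

import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_percentage_error

df = pd.read_csv('test.csv').drop(columns=['Unnamed: 0'])  # placeholder path, same layout as above
model = joblib.load('xgboost_model.joblib')                # placeholder path

y_true = df.pop('fuel_burn_total')
selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                     'seats', 'distance', 'J/T', 'CAT', 'dist']
y_pred = model.predict(df[selected_features])
print(f"Average Accuracy: {1 - mean_absolute_percentage_error(y_true, y_pred):.2%}")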
xgboost/model.py
DELETED
@@ -1,125 +0,0 @@
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib


def load_data(file_path):
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    data = data.drop(data.columns[0], axis=1)  # Drop the 'Unnamed: 0' column
    return data


def preprocess_data(data, features_to_use, categorical_features, numerical_features):
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the dataset
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor


def train_model(X_train, y_train, preprocessor, best_params):
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training set
    final_pipeline.fit(X_train, y_train)

    return final_pipeline


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse


def save_model(model, model_path):
    joblib.dump(model, model_path)


def main():
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
                                                                     numerical_features)

    # best_params is a dictionary that holds the optimal hyperparameters for the XGBoost model.
    # These hyperparameters were determined through a process of hyperparameter tuning.
    #
    # - 'n_estimators': determines the number of boosting rounds or trees to build.
    # - 'max_depth': maximum tree depth for base learners.
    # - 'learning_rate': controls the shrinkage of each tree's contribution. Smaller values require more iterations but
    #   can improve generalization. Typical values range from 0.01 to 0.1.
    # - 'subsample': controls the fraction of observations used for each tree. A smaller subsample value results in
    #   smaller and less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    # Save the final model
    save_model(model, model_file_path)


if __name__ == "__main__":
    main()
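The best_params dictionary above is described as the result of hyperparameter tuning, but the search itself is not part of this file. A sketch of what such a search could look like, using RandomizedSearchCV over the same pipeline; the parameter ranges below are illustrative, not the ones actually searched:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Assumes X_train, y_train and preprocessor come from preprocess_data() above
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])

param_distributions = {  # illustrative ranges, not the original search space
    'model__n_estimators': [100, 200, 400, 800],
    'model__max_depth': [5, 10, 20],
    'model__learning_rate': [0.01, 0.05, 0.08, 0.1],
    'model__subsample': [0.7, 0.8, 0.9, 1.0],
}

search = RandomizedSearchCV(pipeline, param_distributions, n_iter=20, cv=3,
                            scoring='neg_root_mean_squared_error', random_state=42)
search.fit(X_train, y_train)
print(search.best_params_)  # values like these would feed best_params in main()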