Spaces:
Upload 11 files
- models/.DS_Store +0 -0
- models/decision_tree_regression/decision_tree_regressor.py +86 -0
- models/linear_regression/linear_regression.py +140 -0
- models/neural_network/__pycache__/inference.cpython-39.pyc +0 -0
- models/neural_network/inference.py +76 -0
- models/neural_network/model.py +118 -0
- models/neural_network/neural_network.py +161 -0
- models/neural_network/test.ipynb +264 -0
- models/xgboost/gradient_boosting_regressor.py +240 -0
- models/xgboost/inference.py +58 -0
- models/xgboost/model.py +125 -0
models/.DS_Store
ADDED
Binary file (6.15 kB).
models/decision_tree_regression/decision_tree_regressor.py
ADDED
@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
"""decision_tree_regressor.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
"""

import pandas as pd
import requests
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.stats import f  # needed below for f.cdf

# the dataset I am using is from RapidApi
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error {response.status_code}: {response.text}")
# Note: climbData and descendData are not being used since there is only one key entry for both features

# Decision Tree Regressor

features = []  # extracting the features
for flight in data["features"]:
    properties = flight["properties"]
    geometry = flight["geometry"]["coordinates"]
    distance_km = float(properties["dist_km"])
    cruise_time = int(properties["cruiseTime"])
    fuel = float(properties["fuel"])
    CO2 = float(properties["CO2"])
    features.append([distance_km, cruise_time, CO2, fuel])

df = pd.DataFrame(features, columns=["distance_km", "cruise_time", "CO2", "fuel"])  # converting to a data frame
feature = df.drop("fuel", axis=1)
target = df["fuel"]

feature_train, feature_test, target_train, target_test = train_test_split(feature, target, test_size=0.1, random_state=42)
# split into train and test

regression_tree = DecisionTreeRegressor(max_depth=100, min_samples_leaf=50, random_state=42)  # can also change the hyperparameters
regression_tree.fit(feature_train, target_train)
target_prediction = regression_tree.predict(feature_test)  # making the predictions

mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)

n_samples = len(target)  # number of samples
regression = 1  # degrees of freedom for the regression
residual = n_samples - 2
explained_variance = r2 * np.sum((target - np.mean(target))**2)  # over the target column, not the leftover scalar `fuel`
unexplained_variance = mse * n_samples

F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
p_value = 1 - f.cdf(F_value, regression, residual)
rse = np.sqrt(mse)

print(f"mean squared error {mse}")
print(f"R-squared {r2}")
print(f"mean absolute error {mae}")
print(f"regression: {regression:.4f}")
print(f"residual: {residual:.4f}")
print(f"p-value: {p_value:.4f}")  # calculating the p-value for the report
print(f"standard error: {rse:.2f}")
print(f"f-statistic: {F_value:.2f}")

# Very high mse and mae

# Played with the hyperparameters; need to learn a bit more regarding some of them

# Metrics are still high, so this is a bad model
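The closing comments say the hyperparameters were tuned by hand. As a minimal sketch of doing that search systematically (assuming the `feature_train`/`target_train` split from the script above, and that the dataset has enough rows for 5-fold cross-validation), scikit-learn's `GridSearchCV` can sweep the same two hyperparameters:

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid = {
    "max_depth": [3, 5, 10, None],       # candidate depths; None means unlimited
    "min_samples_leaf": [1, 5, 10, 50],  # candidate leaf sizes
}
search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    scoring="neg_mean_squared_error",    # sklearn maximizes scores, so MSE is negated
    cv=5,
)
search.fit(feature_train, target_train)
print(search.best_params_)               # best hyperparameter combination
print(-search.best_score_)               # its cross-validated MSE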
models/linear_regression/linear_regression.py
ADDED
@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
"""linear_regression.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
"""

import pandas as pd
import requests
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.stats import f  # needed below for f.cdf
import tensorflow as tf

# the dataset I am using is from RapidApi
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error {response.status_code}: {response.text}")
# Note: climbData and descendData are not being used since there is only one key entry for both features

# Linear regression model
# Here I am using two features, "fuel" and "dist_nm"
data = response.json()
fuel = []
distance = []

for segment in data['features']:
    fuel.append(float(segment['properties']['fuel']))
    distance.append(float(segment['properties']['dist_nm']))

# converting to NumPy arrays
fuel = np.array(fuel).reshape(-1, 1)
distance = np.array(distance).reshape(-1, 1)

model = LinearRegression()  # creating the model
model.fit(distance, fuel)   # fitting the model

predicted_fuel = model.predict(distance)  # predicted_fuel holds the predicted values

# looking at the model metrics
mse = mean_squared_error(fuel, predicted_fuel)
r2 = r2_score(fuel, predicted_fuel)
future_distance_nm = 30.90  # you can change the value of future_distance_nm
predicted_fuel_future = model.predict([[future_distance_nm]])

n_samples = len(fuel)  # number of samples
regression = 1  # there is only one predictor
residual = n_samples - 2
explained_variance = r2 * np.sum((fuel - np.mean(fuel))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
p_value = 1 - f.cdf(F_value, regression, residual)
rse = np.sqrt(mse)

mean_distance = np.mean(distance)
se_coefficient = rse / np.sqrt(np.sum((distance - mean_distance)**2))

print(f"regression: {regression:.4f}")
print(f"residual: {residual:.4f}")
print(f"p-value: {p_value:.4f}")  # calculating the p-value for the report
print(f"r^2 score: {r2:.2f}")
print(f"fuel per nm (slope): {model.coef_[0][0]:.2f}")  # fuel burned per nautical mile according to the fit
print(f"mean squared error: {mse:.2f}")
print(f"f-statistic: {F_value:.2f}")
print(f"standard error: {rse:.2f}")
print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0][0]:.2f} kg")

# a more in-depth look at the linear regression model, since it is giving good results
# Here I selected more of the important features that contribute to the total fuel needed for the flight

features = [feature['properties'] for feature in data['features']]  # taking the important features
df = pd.DataFrame(features)
numeric_cols = ['dist_km', 'cruiseTime', 'fuel', 'CO2', 'dist_nm']  # can add or remove features
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

df.rename(columns={'fuel': 'cruiseFuel'}, inplace=True)
features = df[['dist_km', 'cruiseTime', 'CO2', 'dist_nm']]  # can add or remove features
target = df['cruiseFuel']
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)  # split into train and test

model = LinearRegression()  # model
model.fit(features_train, target_train)  # fitting the model
target_prediction = model.predict(features_test)  # making predictions

mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
future_distance_nm = [30.90, 40, 1894.34, 23.9]  # you can change these values; order is ['dist_km', 'cruiseTime', 'CO2', 'dist_nm']
predicted_fuel_future = model.predict([future_distance_nm])
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # change to "dist_km" to see the average per km

n_samples = len(target)  # number of samples
regression = features.shape[1]  # number of predictors (four here, not one)
residual = n_samples - regression - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)  # over this model's target, not `fuel` from the first model
unexplained_variance = mse * n_samples

F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
p_value = 1 - f.cdf(F_value, regression, residual)
rse = np.sqrt(mse)

print(f"mean squared error {mse:.2f}")
print(f"R-squared {r2:.2f}")
print(f"mean absolute error {mae:.2f}")
print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the LR model")
print(f"regression: {regression:.4f}")
print(f"residual: {residual:.4f}")
print(f"p-value: {p_value:.4f}")  # calculating the p-value for the report
print(f"standard error: {rse:.2f}")
print(f"f-statistic: {F_value:.2f}")
print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0]:.2f} kg")
# mse is 26.97, which is low; this means the model is performing well
# in the line mse = mean_squared_error(target_test, target_prediction), changing target_test to features_test gave the same mse
# R-squared is close to 1, which means the model is a good fit
# mae is 3.5; this explains why some numbers are a bit different, but the predicted values are close to the actual ones

# the mse went down to 0.0, so this looks good, but I am a bit skeptical
# R-squared went up to 1, so the model is a good fit
# the mae went down to 0

# these results are for the above model
mean_cruise_fuel = df['cruiseFuel'].mean()  # calculating the mean of the cruiseFuel values
mse_to_mean_ratio = mse / mean_cruise_fuel  # calculating the ratio of mse to the mean cruiseFuel
mean_cruise_fuel, mse_to_mean_ratio

# the ratio of 0.0162% means that the mse is small compared to mean_cruise_fuel; this is good, and again the predictions are
# close to the actual values

# the numbers went down even more!
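The F-statistic, p-value, and standard errors above are assembled by hand. As a cross-check, statsmodels computes the standard OLS versions of all of them in one call; a minimal sketch, assuming the `features_train` and `target_train` variables from the multi-feature model above:

import statsmodels.api as sm

X = sm.add_constant(features_train)   # add the intercept column
ols = sm.OLS(target_train, X).fit()

print(ols.fvalue)     # F-statistic
print(ols.f_pvalue)   # its p-value
print(ols.bse)        # standard error of each coefficient
print(ols.summary())  # full regression report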
models/neural_network/__pycache__/inference.cpython-39.pyc
ADDED
Binary file (2.53 kB).
models/neural_network/inference.py
ADDED
@@ -0,0 +1,76 @@
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf


def load_data(path):
    df = pd.read_csv(path)  # use the argument rather than a hardcoded path
    df = df.drop(df.columns[0], axis=1)  # drop the "Unnamed: 0" index column
    return df


def load_model_and_preprocessor(model_path, preprocessor_path):
    loaded_model = tf.keras.models.load_model(model_path)
    preprocessor = joblib.load(preprocessor_path)
    return loaded_model, preprocessor


def select_features(df, selected_features):
    X_test = df[selected_features]
    y_test = df['fuel_burn_total']
    return X_test, y_test


def preprocess_data(preprocessor, X_test):
    X_test_processed = preprocessor.transform(X_test)
    return X_test_processed


def predict_in_batches(loaded_model, X_test_processed, y_test, batch_size):
    num_batches = X_test_processed.shape[0] // batch_size + int(X_test_processed.shape[0] % batch_size != 0)
    total_accuracy = 0

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, X_test_processed.shape[0])
        batch_X = X_test_processed[start_index:end_index]
        batch_y = y_test.iloc[start_index:end_index]

        # Make predictions with the loaded final model
        batch_predictions = loaded_model.predict(batch_X)

        # Calculate accuracy (1 - mean relative error) for the current batch
        batch_accuracy = 1 - np.mean(np.abs(batch_y.values - batch_predictions[:, 0]) / batch_y.values)
        total_accuracy += batch_accuracy * (end_index - start_index)

        print(f'Batch {batch_num + 1}/{num_batches} - Accuracy: {batch_accuracy:.2%}')

    average_accuracy = total_accuracy / X_test_processed.shape[0]
    print(f'Average Accuracy: {average_accuracy:.2%}')


def main():
    df = load_data('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')

    loaded_model, preprocessor = load_model_and_preprocessor(
        '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/nn_model.keras',
        '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/nn_preprocessor.pkl')

    selected_features = [
        'Origin_Airport', 'Destination_Airport', 'Operating_Airline', 'model', '_Manufacturer',
        'seats', 'distance', '_Operating_Airline_ASK_(Millions)', 'FLIGHT_ID', 'FFLOW_KGM',
        'J/T', 'CAT', 'dist', 'mean_taxi_in'
    ]
    # Select only the relevant features
    X_test, y_test = select_features(df, selected_features)

    X_test_processed = preprocess_data(preprocessor, X_test)

    predict_in_batches(loaded_model, X_test_processed, y_test, batch_size=32)


if __name__ == "__main__":
    import os

    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    main()
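The batch loop above weights each batch's accuracy by its size, so the average it prints equals a single pass over the whole test set. A minimal one-shot equivalent (a sketch; it assumes the `loaded_model`, `X_test_processed`, and `y_test` objects produced in `main()` above):

import numpy as np

# one predict() call; Keras batches internally via batch_size
preds = loaded_model.predict(X_test_processed, batch_size=32)[:, 0]

# 1 - mean relative error, matching the "accuracy" defined above
accuracy = 1 - np.mean(np.abs(y_test.values - preds) / y_test.values)
print(f'Average Accuracy: {accuracy:.2%}')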
models/neural_network/model.py
ADDED
@@ -0,0 +1,118 @@
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


def load_data(file_path):
    return pd.read_csv(file_path)


def preprocess_data(data, selected_features, categorical_features, numerical_features):
    # Define preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the datasets
    X = data[selected_features]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocess the datasets
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test, preprocessor


def build_model(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model


def train_model(model, X_train, y_train, epochs=50, batch_size=32, patience=10, validation_split=0.2):
    """
    Trains the provided model using the training data.

    Parameters:
    model (tensorflow.keras.Model): The model to be trained.
    X_train (numpy.ndarray): The training data.
    y_train (numpy.ndarray): The target values for the training data.
    epochs (int, optional): The number of epochs to train the model. Default is 50.
    batch_size (int, optional): The number of samples per gradient update. Default is 32.
    patience (int, optional): Number of epochs with no improvement after which training will be stopped. Default is 10.
    validation_split (float, optional): Fraction of the training data to be used as validation data. Default is 0.2.

    Returns:
    model (tensorflow.keras.Model): The trained model.
    history (tensorflow.python.keras.callbacks.History): A record of training loss values and metrics values at successive epochs.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(X_train, y_train, validation_split=validation_split, epochs=epochs, callbacks=[early_stopping],
                        batch_size=batch_size)

    return model, history


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = np.mean(np.abs(y_test - y_pred.flatten()))
    rmse = np.sqrt(np.mean((y_test - y_pred.flatten()) ** 2))
    return mae, rmse


def save_model(model, preprocessor, model_path, preprocessor_path):
    model.save(model_path)
    joblib.dump(preprocessor, preprocessor_path)


def main():
    data = load_data('../../datasets/preprocessed_data.csv')
    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
                         'CAT', 'dist']

    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'J/T', 'CAT']
    numerical_features = ['seats', 'distance', 'dist']

    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, selected_features, categorical_features,
                                                                     numerical_features)

    model = build_model(X_train.shape[1])

    model, history = train_model(model, X_train, y_train)

    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    save_model(model, preprocessor, '../../saved_models/nn_model.keras', '../../saved_models/nn_preprocessor.pkl')


if __name__ == "__main__":
    import os

    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    main()
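Since `save_model` writes both the network and the fitted `ColumnTransformer`, scoring a new flight later means loading both and transforming the raw row first. A minimal sketch; the paths match `main()` above, while the example row's values are made-up placeholders:

import joblib
import pandas as pd
import tensorflow as tf

model = tf.keras.models.load_model('../../saved_models/nn_model.keras')
preprocessor = joblib.load('../../saved_models/nn_preprocessor.pkl')

# hypothetical input row; columns mirror selected_features in main(),
# and handle_unknown='ignore' keeps unseen categories from crashing
row = pd.DataFrame([{
    'Origin_Airport': 'GVA', 'Destination_Airport': 'MIA',
    'model': 'A320', '_Manufacturer': 'Airbus', 'seats': 180,
    'distance': 7600.0, 'J/T': 'J', 'CAT': 'M', 'dist': 7600.0,
}])

X = preprocessor.transform(row)                       # same scaling/encoding as training
X = X.toarray() if hasattr(X, 'toarray') else X       # densify if the encoder returned a sparse matrix
print(model.predict(X)[0, 0])                         # predicted fuel_burn_total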
models/neural_network/neural_network.py
ADDED
@@ -0,0 +1,161 @@
# # -*- coding: utf-8 -*-
# """neural_network.ipynb

# Automatically generated by Colab.

# Original file is located at
#     https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
# """

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from scipy.stats import f

# # Neural Network model
# # Note: here I am using a new dataset which Abdulelah shared with me.
# # data filename: "preprocessed_data.csv"

# dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')
# # using the dataset Abdulelah gave me

# dataset.dropna(inplace=True)
# dataset.head()


# feature = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]
# target = dataset['fuel_burn_total']

# feature = feature.copy()
# feature.drop('model', axis=1, inplace=True)

# # doing the encoding
# encoder = OneHotEncoder(sparse_output=False)
# feature_encoded = pd.DataFrame(encoder.fit_transform(feature[['model']]))
# feature_encoded.columns = encoder.get_feature_names_out(['model'])
# feature.drop('model', axis=1, inplace=True)
# feature = pd.concat([feature.reset_index(drop=True), feature_encoded.reset_index(drop=True)], axis=1)


# feature_train, feature_test, target_train, target_test = train_test_split(feature, target, test_size=0.1, random_state=42)  # split into train and test
# scaler = StandardScaler()
# feature_train_scaled = scaler.fit_transform(feature_train)
# feature_test_scaled = scaler.transform(feature_test)

# # building the model
# model = Sequential([
#     Dense(64, activation='relu', input_shape=(feature_train_scaled.shape[1],)),
#     Dense(64, activation='relu'),
#     Dense(1)])  # can change the Dense sizes
# model.compile(optimizer='adam', loss='mean_squared_error')  # compiling the model
# model.fit(feature_train_scaled, target_train, epochs=50, batch_size=32, verbose=1)  # training the model

# mse = model.evaluate(feature_test_scaled, target_test)
# print("mean squared error", mse)


# target_prediction = model.predict(feature_test_scaled)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# mse = mean_squared_error(target_test, target_prediction)

# n_samples = len(target)  # number of samples
# regression = 1  # there is only one predictor
# residual = n_samples - 2
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
# unexplained_variance = mse * n_samples

# F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
# p_value = 1 - f.cdf(F_value, regression, residual)
# rse = np.sqrt(mse)

# print(f"R-squared {r2}")
# print(f"mean absolute error {mae}")
# print(f"mean squared error {mse}")
# print(f"regression: {regression:.4f}")
# print(f"residual: {residual:.4f}")
# print(f"p-value: {p_value:.4f}")  # calculating the p-value for the report
# print(f"standard error: {rse:.2f}")
# print(f"f-statistic: {F_value:.2f}")
# # the mse (difference between the predicted and actual fuel burn totals) is around 4.97; if it were lower, that would be better

# # mse is 0 now, so this is a good model!

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scipy.stats import f

# Load the dataset
dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')
dataset.dropna(inplace=True)

# Features and target
# Note: 'fuel_burn' and 'fuel_burn_total' (the target itself) are in the feature list,
# which leaks the label into the inputs and inflates the metrics below
features = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]
target = dataset['fuel_burn_total']

# Encoding the 'model' column
encoder = OneHotEncoder(sparse_output=False)
model_encoded = pd.DataFrame(encoder.fit_transform(features[['model']]))
model_encoded.columns = encoder.get_feature_names_out(['model'])

# Drop the original 'model' column and add the encoded data
features = features.drop('model', axis=1)
features = pd.concat([features.reset_index(drop=True), model_encoded.reset_index(drop=True)], axis=1)

# Train-test split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)

# Feature scaling
scaler = StandardScaler()
feature_train_scaled = scaler.fit_transform(feature_train)
feature_test_scaled = scaler.transform(feature_test)

# Neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(feature_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(feature_train_scaled, target_train, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
mse = model.evaluate(feature_test_scaled, target_test)
print("Mean Squared Error:", mse)

# Predictions and performance metrics
target_prediction = model.predict(feature_test_scaled)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
mse = mean_squared_error(target_test, target_prediction)

# Calculate F-statistic and p-value (for reporting purposes)
n_samples = len(target)
n_predictors = feature_train_scaled.shape[1]
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)
rse = np.sqrt(mse)

# Print the results
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"p-value: {p_value:.4f}")
print(f"Root Mean Squared Error: {rse:.2f}")
print(f"F-statistic: {F_value:.2f}")
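Written out, the F-statistic block above implements the usual regression F-test, approximating the sums of squares from the metrics already computed (n samples, p predictors):

    F = (SSR / p) / (SSE / (n − p − 1)),  where  SSR ≈ R² · Σᵢ(yᵢ − ȳ)²  and  SSE ≈ n · MSE

and the p-value is the upper tail of the F(p, n − p − 1) distribution, which is exactly what `1 - f.cdf(F_value, n_predictors, residual)` computes.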
models/neural_network/test.ipynb
ADDED
@@ -0,0 +1,264 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense\n",
    "from scipy.stats import f\n",
    "\n",
    "# Load the dataset\n",
    "dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')\n",
    "dataset.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features and target\n",
    "features = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]\n",
    "target = dataset['fuel_burn_total']\n",
    "\n",
    "# Encoding the 'model' column\n",
    "encoder = OneHotEncoder(sparse_output=False)\n",
    "model_encoded = pd.DataFrame(encoder.fit_transform(features[['model']]))\n",
    "model_encoded.columns = encoder.get_feature_names_out(['model'])\n",
    "\n",
    "# Drop the original 'model' column and add the encoded data\n",
    "features = features.drop('model', axis=1)\n",
    "features = pd.concat([features.reset_index(drop=True), model_encoded.reset_index(drop=True)], axis=1)\n",
    "\n",
    "# Train-test split\n",
    "feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)\n",
    "\n",
    "# Feature scaling\n",
    "scaler = StandardScaler()\n",
    "feature_train_scaled = scaler.fit_transform(feature_train)\n",
    "feature_test_scaled = scaler.transform(feature_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/50\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/envs/Intenv/lib/python3.9/site-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
      "  super().__init__(activity_regularizer=activity_regularizer, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1314/1314 - 9s 6ms/step - loss: 140.5811\n",
      "[progress-bar output for epochs 2-49 trimmed; per-epoch loss fell from 1.9729 to roughly 0.1]\n",
      "Epoch 50/50\n",
      "1314/1314 - 8s 6ms/step - loss: 0.1462\n",
      "146/146 - 0s 2ms/step - loss: 0.0717\n",
      "Mean Squared Error: 0.16058479249477386\n",
      "146/146 - 0s 1ms/step\n"
     ]
    }
   ],
   "source": [
    "# Neural network model\n",
    "model = Sequential([\n",
    "    Dense(64, activation='relu', input_shape=(feature_train_scaled.shape[1],)),\n",
    "    Dense(64, activation='relu'),\n",
    "    Dense(1)\n",
    "])\n",
    "\n",
    "# Compile and train the model\n",
    "model.compile(optimizer='adam', loss='mean_squared_error')\n",
    "model.fit(feature_train_scaled, target_train, epochs=50, batch_size=32, verbose=1)\n",
    "\n",
    "# Evaluate the model\n",
    "mse = model.evaluate(feature_test_scaled, target_test)\n",
    "print(\"Mean Squared Error:\", mse)\n",
    "\n",
    "# Predictions and performance metrics\n",
    "target_prediction = model.predict(feature_test_scaled)\n",
    "r2 = r2_score(target_test, target_prediction)\n",
    "mae = mean_absolute_error(target_test, target_prediction)\n",
    "mse = mean_squared_error(target_test, target_prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared: 0.9780861666108605\n",
      "Mean Absolute Error: 0.7006260730692777\n",
      "Mean Squared Error: 2.554603752569432\n",
      "p-value: 0.0000\n",
      "Root Squared Error: 1.60\n",
      "F-statistic: 24052.88\n"
     ]
    }
   ],
   "source": [
    "# Calculate F-statistic and p-value\n",
    "n_samples = len(target)\n",
    "n_predictors = feature_train_scaled.shape[1]\n",
    "residual = n_samples - n_predictors - 1\n",
    "explained_variance = r2 * np.sum((target - np.mean(target))**2)\n",
    "unexplained_variance = mse * n_samples\n",
    "\n",
    "F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)\n",
    "p_value = 1 - f.cdf(F_value, n_predictors, residual)\n",
    "rse = np.sqrt(mse)\n",
    "\n",
    "# Print the results\n",
    "print(f\"R-squared: {r2}\")\n",
    "print(f\"Mean Absolute Error: {mae}\")\n",
    "print(f\"Mean Squared Error: {mse}\")\n",
    "print(f\"p-value: {p_value:.4f}\")\n",
    "print(f\"Root Squared Error: {rse:.2f}\")\n",
    "print(f\"F-statistic: {F_value:.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Intenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
models/xgboost/gradient_boosting_regressor.py
ADDED
@@ -0,0 +1,240 @@
1 |
+
# # -*- coding: utf-8 -*-
|
2 |
+
# """gradient_boosting_regressor.ipynb
|
3 |
+
|
4 |
+
# Automatically generated by Colab.
|
5 |
+
|
6 |
+
# Original file is located at
|
7 |
+
# https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
|
8 |
+
# """
|
9 |
+
|
10 |
+
# import pandas as pd
|
11 |
+
# import requests
|
12 |
+
# import numpy as np
|
13 |
+
# from sklearn.linear_model import LinearRegression
|
14 |
+
# from sklearn.model_selection import train_test_split
|
15 |
+
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
|
16 |
+
# from sklearn.ensemble import GradientBoostingRegressor
|
17 |
+
# from sklearn.tree import DecisionTreeRegressor
|
18 |
+
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
19 |
+
|
20 |
+
# # the dataset I am using is from RapidApi
|
21 |
+
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
|
22 |
+
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
|
23 |
+
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
|
24 |
+
# response = requests.get(url, headers = headers)
|
25 |
+
# if response.status_code == 200:
|
26 |
+
# data = response.json()
|
27 |
+
# print(data)
|
28 |
+
# else:
|
29 |
+
# print({response.status_code}, {response.text})
|
30 |
+
# # Note climbData and descendData is not being used since there is only one key entry for both features
|
31 |
+
|
32 |
+
# # Gradient Boosting Regressor
|
33 |
+
# # In here Im using the same .json dataset with a new model Gradient Boosting Regressor
|
34 |
+
|
35 |
+
# data = response.json()
|
36 |
+
# features = [feature['properties'] for feature in data['features']]
|
37 |
+
# df = pd.DataFrame(features) # extracting features for the model
|
38 |
+
|
39 |
+
# #print(df.columns)
|
40 |
+
|
41 |
+
# # numeric
|
42 |
+
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors = 'coerce')
|
43 |
+
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors = 'coerce')
|
44 |
+
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors = 'coerce')
|
45 |
+
# df['fuel'] = pd.to_numeric(df['fuel'], errors = 'coerce')
|
46 |
+
# df['CO2'] = pd.to_numeric(df['CO2'], errors = 'coerce')
|
47 |
+
|
48 |
+
# df.dropna(inplace = True)
|
49 |
+
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']] # you can play with this and add more features I kept it simple with what I know is important
|
50 |
+
# target = df['fuel']
|
51 |
+
|
52 |
+
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.2, random_state = 42) # split into train and test
|
53 |
+
|
54 |
+
# model = GradientBoostingRegressor(n_estimators = 100, learning_rate = 25, max_depth = 5, random_state = 42) # can play with the hyperparameters and observe model metrics
|
55 |
+
# model.fit(features_train, target_train) # fitting the model
|
56 |
+
# target_prediction = model.predict(features_test) # predictions
|
57 |
+
|
58 |
+
# mse = mean_squared_error(target_test, target_prediction)
|
59 |
+
# r2 = r2_score(target_test, target_prediction)
|
60 |
+
# mae = mean_absolute_error(target_test, target_prediction)
|
61 |
+
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()
|
62 |
+
|
63 |
+
|
64 |
+
# feature_we_want = len(target) # what we are looking for
|
65 |
+
# regression = 1 # there is only one predictor
|
66 |
+
# residual = feature_we_want - 2
|
67 |
+
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
|
68 |
+
# unexplained_variance = mse * feature_we_want
|
69 |
+
|
70 |
+
# F_value = (explained_variance / regression) / (unexplained_variance / residual) # calculating the F statistic for the report purposes
|
71 |
+
# p_value = 1 - f.cdf(F_value, regression, residual)
|
72 |
+
# rse = np.sqrt(mse)
|
73 |
+
|
74 |
+
# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92] # you can change the value of future_distance_nm ['dist_km', 'cruiseTime', 'CO2', 'dist_nm']
|
75 |
+
# predicted_fuel_future = model.predict([future_distance_nm]) # you will need predicted_fuel
|
76 |
+
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean() # can change to "dist_km" to see the average in km
|
77 |
+
|
78 |
+
# print(f"mean squared error: {mse}") # checking the model perfomance
|
79 |
+
# print(f"R-squared: {r2}")
|
80 |
+
# print(f"mean absolute error: {mae}")
|
81 |
+
# print(f"average fuel consumption per nautical mile:: {average_predicted_fuel_per_nm:.2f} for XGBoost model")
|
82 |
+
# print(f"regression: {regression:.4f}")
|
83 |
+
# print(f"residual: {residual:.4f}")
|
84 |
+
# print(f"p-value: {p_value:.4f}") # calculating P value for the report
|
85 |
+
# print(f"standard error: {rse:.2f}")
|
86 |
+
# print(f"f-statistic: {F_value:.2f}")
|
87 |
+
# print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0]:.2f} kg")
|
88 |
+
|
89 |
+
# # seems like the mse is verrryyy highhhhhhh but this chnages if we add or take off features
|
90 |
+
# # the Rsquare and mae have same numbers as the linear resseion model so thats good
|
91 |
+
|
92 |
+
# # added more features I am now playing with the hyperparameters the metrics go up and down based of the hyperparameters
|
93 |
+
|
94 |
+
# # mse really high, this is a bad model, rquare is a negative number
|
95 |
+
|
96 |
+
# import pandas as pd
|
97 |
+
# import requests
|
98 |
+
# import numpy as np
|
99 |
+
# from sklearn.model_selection import train_test_split
|
100 |
+
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
|
101 |
+
# from sklearn.ensemble import GradientBoostingRegressor
|
102 |
+
# from sklearn.preprocessing import StandardScaler
|
103 |
+
|
104 |
+
# # Load data from API
|
105 |
+
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
|
106 |
+
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
|
107 |
+
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
|
108 |
+
# response = requests.get(url, headers=headers)
|
109 |
+
|
110 |
+
# if response.status_code == 200:
|
111 |
+
# data = response.json()
|
112 |
+
# else:
|
113 |
+
# print(f"Error {response.status_code}: {response.text}")
|
114 |
+
|
115 |
+
# # Extract features
|
116 |
+
# features = [feature['properties'] for feature in data['features']]
|
117 |
+
# df = pd.DataFrame(features)
|
118 |
+
|
119 |
+
# # Convert relevant columns to numeric
|
120 |
+
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
|
121 |
+
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
|
122 |
+
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
|
123 |
+
# df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
|
124 |
+
# df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')
|
125 |
+
|
126 |
+
# df.dropna(inplace=True)
|
127 |
+
|
128 |
+
# # Define features and target
|
129 |
+
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
|
130 |
+
# target = df['fuel']
|
131 |
+
|
132 |
+
# # Split the data
|
133 |
+
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)
|
134 |
+
|
135 |
+
# # Gradient Boosting Regressor
|
136 |
+
# model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
|
137 |
+
# model.fit(features_train, target_train)
|
138 |
+
# target_prediction = model.predict(features_test)
|
139 |
+
|
140 |
+
# # Evaluate model performance
|
141 |
+
# mse = mean_squared_error(target_test, target_prediction)
|
142 |
+
# r2 = r2_score(target_test, target_prediction)
|
143 |
+
# mae = mean_absolute_error(target_test, target_prediction)
|
144 |
+
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()
|
145 |
+
|
146 |
+
# # Future predictions
|
147 |
+
# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
|
148 |
+
# predicted_fuel_future = model.predict([future_distance_nm])
|
149 |
+
|
150 |
+
# # Print the results
|
151 |
+
# print(f"Mean Squared Error: {mse}")
|
152 |
+
# print(f"R-squared: {r2}")
|
153 |
+
# print(f"Mean Absolute Error: {mae}")
|
154 |
+
# print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
|
155 |
+
# print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")
|
156 |
+
|
157 |
+
# # Comment on performance
|
158 |
+
# if mse > 1000: # Threshold can be adjusted
|
159 |
+
# print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
|
160 |
+
# if r2 < 0:
|
161 |
+
# print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
|
162 |
+
|
163 |
+
import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import f  # F-distribution, used for the significance test below

# Load data from API
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    data = response.json()
else:
    print(f"Error {response.status_code}: {response.text}")

# Extract features
features = [feature['properties'] for feature in data['features']]
df = pd.DataFrame(features)

# Convert relevant columns to numeric
df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

df.dropna(inplace=True)

# Define features and target
# Note: 'fuel' is both a feature and the target here, so the target leaks into
# the model and the metrics below are optimistic.
features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
target = df['fuel']

# Split the data
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(features_train, target_train)
target_prediction = model.predict(features_test)

# Evaluate model performance
mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# Future predictions: one row with the same five feature values the model was trained on
future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
predicted_fuel_future = model.predict([future_distance_nm])

# Calculate F-statistic and p-value
n_samples = len(target)
n_predictors = features_train.shape[1]
residual = n_samples - n_predictors - 1  # residual degrees of freedom
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples  # note: mse is the test-set MSE, so the resulting F-test is only approximate

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)
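
# For reference, this mirrors the standard overall F-test for a regression,
# F = (SSR / p) / (SSE / (n - p - 1)). A quick sanity check on toy numbers
# (illustrative only, not real API output): with n = 50 samples, p = 5
# predictors, R-squared = 0.8 and a total sum of squares of 1000, we get
# SSR = 800 and SSE = 200, so F = (800 / 5) / (200 / 44) = 35.2.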
# Print the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")
print(f"p-value: {p_value:.4f}")
print(f"F-statistic: {F_value:.2f}")

# Comment on performance
if mse > 1000:  # Threshold can be adjusted
    print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
if r2 < 0:
    print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
models/xgboost/inference.py
ADDED
@@ -0,0 +1,58 @@
import pandas as pd
import joblib


def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df.drop(df.columns[0], axis=1)  # Drop the 'Unnamed: 0' index column
    return df


def load_model(model_path):
    return joblib.load(model_path)


def evaluate_model(df, model, selected_features, batch_size=100):
    total_accuracy = 0
    num_rows = len(df)

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        batch_df = df.iloc[start:end].copy()  # copy so pop() does not mutate the original frame

        fuel_burn_total = batch_df.pop('fuel_burn_total').values
        batch_df = batch_df[selected_features]

        predictions = model.predict(batch_df)

        # Per-row relative accuracy: 1 - |actual - predicted| / actual
        accuracy = 1 - abs(fuel_burn_total - predictions) / fuel_burn_total
        batch_accuracy = accuracy.mean()
        total_accuracy += batch_accuracy * len(batch_df)

        print(f'Processed rows {start + 1} to {end} out of {num_rows} rows')

    average_accuracy = total_accuracy / num_rows
    return average_accuracy
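

# A minimal sketch of the relative-accuracy metric above, on made-up numbers
# (not from the real dataset): a 900 kg prediction against a 1000 kg actual
# gives 1 - |1000 - 900| / 1000 = 0.90, i.e. 90% accuracy for that row.
# For example:
#   actual    = np.array([1000.0, 2000.0])
#   predicted = np.array([ 900.0, 2100.0])
#   (1 - abs(actual - predicted) / actual).mean()  # -> 0.925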
def main():
    data_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/test.csv'
    model_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/xgboost_model.joblib'

    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
                         'CAT', 'dist']

    # Load data and model
    df = load_data(data_file_path)
    model = load_model(model_file_path)

    # Evaluate the model
    average_accuracy = evaluate_model(df, model, selected_features)

    # Print the average accuracy
    print(f'Average Accuracy: {average_accuracy:.2%}')


if __name__ == "__main__":
    main()
models/xgboost/model.py
ADDED
@@ -0,0 +1,125 @@
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib


def load_data(file_path):
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    data = data.drop(data.columns[0], axis=1)  # Drop the 'Unnamed: 0' index column
    return data


def preprocess_data(data, features_to_use, categorical_features, numerical_features):
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the dataset
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor
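

# A minimal sketch of what the preprocessor produces, on made-up data (the
# column names here are illustrative, not from the real dataset):
#   toy = pd.DataFrame({'dist': [1.0, 2.0], 'CAT': ['A', 'B']})
#   ct = ColumnTransformer([('num', StandardScaler(), ['dist']),
#                           ('cat', OneHotEncoder(handle_unknown='ignore'), ['CAT'])])
#   ct.fit_transform(toy)  # -> [[-1., 1., 0.], [1., 0., 1.]]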
def train_model(X_train, y_train, preprocessor, best_params):
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training set
    final_pipeline.fit(X_train, y_train)

    return final_pipeline


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse
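

# Both metrics are in the target's units (kg of fuel burned). On toy errors
# (illustrative only): residuals of [10, -20, 30] kg give MAE = 20 kg and
# RMSE = sqrt((100 + 400 + 900) / 3) ≈ 21.6 kg; RMSE penalizes large errors more.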
def save_model(model, model_path):
    joblib.dump(model, model_path)


def main():
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
                                                                     numerical_features)

    # best_params holds the optimal hyperparameters for the XGBoost model,
    # determined through hyperparameter tuning.
    #
    # - 'n_estimators': the number of boosting rounds (trees) to build.
    # - 'max_depth': maximum tree depth for base learners.
    # - 'learning_rate': controls the shrinkage of each tree's contribution. Smaller values require more iterations but
    #   can improve generalization. Typical values range from 0.01 to 0.1.
    # - 'subsample': the fraction of observations used for each tree. A smaller subsample value results in
    #   smaller and less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }
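
    # A sketch of how values like these could be found (the actual search is
    # not part of this file, so treat this as illustrative only):
    #   from sklearn.model_selection import RandomizedSearchCV
    #   search = RandomizedSearchCV(
    #       Pipeline(steps=[('preprocessor', preprocessor),
    #                       ('model', xgb.XGBRegressor(objective='reg:squarederror'))]),
    #       param_distributions={'model__n_estimators': [200, 400, 800],
    #                            'model__max_depth': [10, 20, 30],
    #                            'model__learning_rate': [0.05, 0.08, 0.1],
    #                            'model__subsample': [0.8, 0.9, 1.0]},
    #       n_iter=10, cv=3, scoring='neg_mean_absolute_error', random_state=42)
    #   search.fit(X_train, y_train)  # search.best_params_ would feed the dict above
    #   (note: the 'model__' prefix would need stripping before reuse here)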
    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    # Save the final model
    save_model(model, model_file_path)


if __name__ == "__main__":
    main()