poudel committed on
Commit
f637442
1 Parent(s): 6b84851

Upload 11 files

models/.DS_Store ADDED
Binary file (6.15 kB).
 
models/decision_tree_regression/decision_tree_regressor.py ADDED
@@ -0,0 +1,86 @@

# -*- coding: utf-8 -*-
"""decision_tree_regressor.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
"""

import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import f  # needed below for the F-distribution p-value

# the dataset I am using is from RapidApi
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error {response.status_code}: {response.text}")
# Note: climbData and descendData are not used since there is only one key entry for both features

# Decision Tree Regressor

features = []  # extracting the features
for flight in data["features"]:
    properties = flight["properties"]
    geometry = flight["geometry"]["coordinates"]  # extracted but currently unused
    distance_km = float(properties["dist_km"])
    cruise_time = int(properties["cruiseTime"])
    fuel = float(properties["fuel"])
    CO2 = float(properties["CO2"])
    features.append([distance_km, cruise_time, CO2, fuel])

df = pd.DataFrame(features, columns=["distance_km", "cruise_time", "CO2", "fuel"])  # converting to a data frame
feature = df.drop("fuel", axis=1)
target = df["fuel"]

feature_train, feature_test, target_train, target_test = train_test_split(feature, target, test_size=0.1, random_state=42)
# split into train and test

regression_tree = DecisionTreeRegressor(max_depth=100, min_samples_leaf=50, random_state=42)  # can also change the hyperparameters
regression_tree.fit(feature_train, target_train)
target_prediction = regression_tree.predict(feature_test)  # making the predictions

mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)

n_samples = len(target)
n_predictors = feature.shape[1]  # three predictors: distance_km, cruise_time and CO2
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)  # calculating the F statistic for the report
p_value = 1 - f.cdf(F_value, n_predictors, residual)
rse = np.sqrt(mse)

print(f"mean squared error {mse}")
print(f"R-squared {r2}")
print(f"mean absolute error {mae}")
print(f"regression df: {n_predictors}")
print(f"residual df: {residual}")
print(f"p-value: {p_value:.4f}")  # p-value for the report
print(f"standard error: {rse:.2f}")
print(f"f-statistic: {F_value:.2f}")

# Very high mse and mae

# Played with the hyperparameters; I need to learn a bit more about some of them (a tuning sketch follows this file)

# metrics are still high, so this is a bad model
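
The closing comments mention tuning max_depth and min_samples_leaf by hand. Below is a minimal sketch of automating that with scikit-learn's GridSearchCV, assuming the `feature` and `target` frames built above; the candidate values are illustrative, not settings used in this commit.

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Illustrative candidate values only; the API returns few route segments, so keep cv small
param_grid = {
    "max_depth": [2, 5, 10, None],
    "min_samples_leaf": [1, 5, 20, 50],
}
search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
)
search.fit(feature, target)
print(search.best_params_, -search.best_score_)  # best settings and their cross-validated MSE
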
models/linear_regression/linear_regression.py ADDED
@@ -0,0 +1,140 @@

# -*- coding: utf-8 -*-
"""linear_regression.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
"""

import pandas as pd
import requests
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import f  # needed below for the F-distribution p-value

# the dataset I am using is from RapidApi
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error {response.status_code}: {response.text}")
# Note: climbData and descendData are not used since there is only one key entry for both features

# Linear regression model
# Here I am using two columns, "fuel" and "dist_nm"
fuel = []
distance = []

for segment in data['features']:
    fuel.append(float(segment['properties']['fuel']))
    distance.append(float(segment['properties']['dist_nm']))

# converting to numpy arrays
fuel = np.array(fuel).reshape(-1, 1)
distance = np.array(distance).reshape(-1, 1)

model = LinearRegression()  # building the model
model.fit(distance, fuel)  # fitting the model

predicted_fuel = model.predict(distance)  # predicted_fuel holds the predicted values

# looking at the model metrics
mse = mean_squared_error(fuel, predicted_fuel)
r2 = r2_score(fuel, predicted_fuel)
future_distance_nm = 30.90  # you can change the value of future_distance_nm
predicted_fuel_future = model.predict([[future_distance_nm]])  # predicting fuel for a new distance

n_samples = len(fuel)
regression = 1  # there is only one predictor
residual = n_samples - 2
explained_variance = r2 * np.sum((fuel - np.mean(fuel))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
p_value = 1 - f.cdf(F_value, regression, residual)
rse = np.sqrt(mse)

mean_distance = np.mean(distance)
se_coefficient = rse / np.sqrt(np.sum((distance - mean_distance)**2))

print(f"regression df: {regression}")
print(f"residual df: {residual}")
print(f"p-value: {p_value:.4f}")  # p-value for the report
print(f"r^2 score: {r2:.2f}")
print(f"slope: {model.coef_[0][0]:.2f}")  # kg of fuel per nautical mile
print(f"mean squared error: {mse:.2f}")
print(f"f-statistic: {F_value:.2f}")
print(f"standard error: {rse:.2f}")
print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0][0]:.2f} kg")

# a more in-depth version of the linear regression model, since it is giving good results
# Here I selected more of the important features that contribute to the total fuel needed for the flight

features = [feature['properties'] for feature in data['features']]  # taking the important features
df = pd.DataFrame(features)
numeric_cols = ['dist_km', 'cruiseTime', 'fuel', 'CO2', 'dist_nm']  # can add or remove features
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

df.rename(columns={'fuel': 'cruiseFuel'}, inplace=True)
features = df[['dist_km', 'cruiseTime', 'CO2', 'dist_nm']]  # can add or remove features
target = df['cruiseFuel']
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)  # split into train and test

model = LinearRegression()  # model
model.fit(features_train, target_train)  # fitting the model
target_prediction = model.predict(features_test)  # making predictions

mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
future_distance_nm = [30.90, 40, 1894.34, 23.9]  # values for ['dist_km', 'cruiseTime', 'CO2', 'dist_nm']; you can change them
predicted_fuel_future = model.predict([future_distance_nm])
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # change to "dist_km" to see the average per km

n_samples = len(target)
n_predictors = features.shape[1]  # four predictors in this model
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)  # F statistic for the report
p_value = 1 - f.cdf(F_value, n_predictors, residual)
rse = np.sqrt(mse)

print(f"mean squared error {mse:.2f}")
print(f"R-squared {r2:.2f}")
print(f"mean absolute error {mae:.2f}")
print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the LR model")
print(f"regression df: {n_predictors}")
print(f"residual df: {residual}")
print(f"p-value: {p_value:.4f}")  # p-value for the report
print(f"standard error: {rse:.2f}")
print(f"f-statistic: {F_value:.2f}")
print(f"predicted fuel needed for a flight with features {future_distance_nm}: {predicted_fuel_future[0]:.2f} kg")
# mse is 26.97 on the single-feature model, which is low; this means the model is performing well
# note: recomputing the predictions from features_test gives the same mse as reusing target_prediction
# R-squared is close to 1, which means the model is a good fit
# mae is 3.5, which explains why some numbers differ a bit, but the predicted values are close to the actual ones

# with the extra features the mse went down to 0.0, which looks good, but I am a bit sceptical (see the leakage check after this file)
# R-squared went up to 1, so the model looks like a perfect fit
# the mae went down to 0

# these results are for the model above
mean_cruise_fuel = df['cruiseFuel'].mean()  # mean of the cruiseFuel values
mse_to_mean_ratio = mse / mean_cruise_fuel  # ratio of mse to the mean cruiseFuel
print(mean_cruise_fuel, mse_to_mean_ratio)

# the ratio of 0.0162% means the mse is small compared to mean_cruise_fuel; this is good, and again the predictions are
# close to the actual values

# the numbers went down even more!
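
The jump to mse 0.0 and R-squared 1.0 is most likely target leakage rather than a good fit: CO2 in this dataset is derived from fuel burned (roughly 3.16 kg of CO2 per kg of jet fuel), so the 'CO2' column effectively hands the model the answer. A minimal check and refit, assuming the `df` built above:

# Correlations with the target; 'CO2' should come out essentially perfect
print(df[['dist_km', 'cruiseTime', 'CO2', 'dist_nm']].corrwith(df['cruiseFuel']))

# Refit without the leaky column; the metrics should become realistic again
clean_features = df[['dist_km', 'cruiseTime', 'dist_nm']]
Xtr, Xte, ytr, yte = train_test_split(clean_features, df['cruiseFuel'], test_size=0.1, random_state=42)
clean_model = LinearRegression().fit(Xtr, ytr)
print(f"R-squared without CO2: {r2_score(yte, clean_model.predict(Xte)):.2f}")
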
models/neural_network/__pycache__/inference.cpython-39.pyc ADDED
Binary file (2.53 kB).
 
models/neural_network/inference.py ADDED
@@ -0,0 +1,76 @@

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf


def load_data(path):
    df = pd.read_csv(path)
    df = df.drop(df.columns[0], axis=1)  # drop the Unnamed: 0 column
    return df


def load_model_and_preprocessor(model_path, preprocessor_path):
    loaded_model = tf.keras.models.load_model(model_path)
    preprocessor = joblib.load(preprocessor_path)
    return loaded_model, preprocessor


def select_features(df, selected_features):
    X_test = df[selected_features]
    y_test = df['fuel_burn_total']
    return X_test, y_test


def preprocess_data(preprocessor, X_test):
    X_test_processed = preprocessor.transform(X_test)
    return X_test_processed


def predict_in_batches(loaded_model, X_test_processed, y_test, batch_size):
    num_batches = X_test_processed.shape[0] // batch_size + int(X_test_processed.shape[0] % batch_size != 0)
    total_accuracy = 0

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, X_test_processed.shape[0])
        batch_X = X_test_processed[start_index:end_index]
        batch_y = y_test.iloc[start_index:end_index]

        # Make predictions with the loaded final model
        batch_predictions = loaded_model.predict(batch_X)

        # Accuracy for the current batch: 1 minus the mean absolute percentage error
        batch_accuracy = 1 - np.mean(np.abs(batch_y.values - batch_predictions[:, 0]) / batch_y.values)
        total_accuracy += batch_accuracy * (end_index - start_index)

        print(f'Batch {batch_num + 1}/{num_batches} - Accuracy: {batch_accuracy:.2%}')

    average_accuracy = total_accuracy / X_test_processed.shape[0]
    print(f'Average Accuracy: {average_accuracy:.2%}')


def main():
    df = load_data('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')

    loaded_model, preprocessor = load_model_and_preprocessor(
        '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/nn_model.keras',
        '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/nn_preprocessor.pkl')

    selected_features = [
        'Origin_Airport', 'Destination_Airport', 'Operating_Airline', 'model', '_Manufacturer',
        'seats', 'distance', '_Operating_Airline_ASK_(Millions)', 'FLIGHT_ID', 'FFLOW_KGM',
        'J/T', 'CAT', 'dist', 'mean_taxi_in'
    ]
    # Select only the relevant features
    X_test, y_test = select_features(df, selected_features)

    X_test_processed = preprocess_data(preprocessor, X_test)

    predict_in_batches(loaded_model, X_test_processed, y_test, batch_size=32)


if __name__ == "__main__":
    import os

    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    main()
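
As a cross-check of predict_in_batches: because each batch's accuracy is weighted by its size, the batched average equals one vectorized computation over the whole test set. A minimal sketch, assuming the same `loaded_model`, `X_test_processed` and `y_test` as above:

def overall_accuracy(loaded_model, X_test_processed, y_test, batch_size=32):
    # Keras batches internally, so a single predict call is enough
    predictions = loaded_model.predict(X_test_processed, batch_size=batch_size)[:, 0]
    # 1 minus the mean absolute percentage error, as in predict_in_batches
    return 1 - np.mean(np.abs(y_test.values - predictions) / y_test.values)
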
models/neural_network/model.py ADDED
@@ -0,0 +1,118 @@

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


def load_data(file_path):
    return pd.read_csv(file_path)


def preprocess_data(data, selected_features, categorical_features, numerical_features):
    # Define preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the datasets
    X = data[selected_features]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocess the datasets
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test, preprocessor


def build_model(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model


def train_model(model, X_train, y_train, epochs=50, batch_size=32, patience=10, validation_split=0.2):
    """
    Trains the provided model using the training data.

    Parameters:
    model (tensorflow.keras.Model): The model to be trained.
    X_train (numpy.ndarray): The training data.
    y_train (numpy.ndarray): The target values for the training data.
    epochs (int, optional): The number of epochs to train the model. Default is 50.
    batch_size (int, optional): The number of samples per gradient update. Default is 32.
    patience (int, optional): Number of epochs with no improvement after which training will be stopped. Default is 10.
    validation_split (float, optional): Fraction of the training data to be used as validation data. Default is 0.2.

    Returns:
    model (tensorflow.keras.Model): The trained model.
    history (tensorflow.python.keras.callbacks.History): A record of training loss values and metrics values at successive epochs.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(X_train, y_train, validation_split=validation_split, epochs=epochs, callbacks=[early_stopping],
                        batch_size=batch_size)

    return model, history


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = np.mean(np.abs(y_test - y_pred.flatten()))
    rmse = np.sqrt(np.mean((y_test - y_pred.flatten()) ** 2))
    return mae, rmse


def save_model(model, preprocessor, model_path, preprocessor_path):
    model.save(model_path)
    joblib.dump(preprocessor, preprocessor_path)


def main():
    data = load_data('../../datasets/preprocessed_data.csv')
    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
                         'CAT', 'dist']

    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'J/T', 'CAT']
    numerical_features = ['seats', 'distance', 'dist']

    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, selected_features, categorical_features,
                                                                     numerical_features)

    model = build_model(X_train.shape[1])

    model, history = train_model(model, X_train, y_train)

    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    save_model(model, preprocessor, '../../saved_models/nn_model.keras', '../../saved_models/nn_preprocessor.pkl')


if __name__ == "__main__":
    import os

    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    main()
models/neural_network/neural_network.py ADDED
@@ -0,0 +1,161 @@

# # -*- coding: utf-8 -*-
# """neural_network.ipynb

# Automatically generated by Colab.

# Original file is located at
# https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
# """

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from scipy.stats import f

# # Neural Network model
# # Note: here I am using a new dataset which Abdulelah shared with me.
# # data filename: "preprocessed_data.csv"

# dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')
# # using the dataset Abdulelah gave me

# dataset.dropna(inplace = True)
# dataset.head()


# feature = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]
# target = dataset['fuel_burn_total']

# feature = feature.copy()
# feature.drop('model', axis=1, inplace=True)

# # doing the encoding
# encoder = OneHotEncoder(sparse_output = False)
# feature_encoded = pd.DataFrame(encoder.fit_transform(feature[['model']]))
# feature_encoded.columns = encoder.get_feature_names_out(['model'])
# feature.drop('model', axis = 1, inplace = True)
# feature = pd.concat([feature.reset_index(drop = True), feature_encoded.reset_index(drop = True)], axis = 1)


# feature_train, feature_test, target_train, target_test = train_test_split(feature, target, test_size = 0.1, random_state = 42)  # split into train and test
# scaler = StandardScaler()
# feature_train_scaled = scaler.fit_transform(feature_train)
# feature_test_scaled = scaler.transform(feature_test)

# # building the model
# model = Sequential([
#     Dense(64, activation = 'relu', input_shape = (feature_train_scaled.shape[1],)),
#     Dense(64, activation = 'relu'),
#     Dense(1)])  # can change the Dense layers
# model.compile(optimizer = 'adam', loss = 'mean_squared_error')  # compiling the model
# model.fit(feature_train_scaled, target_train, epochs = 50, batch_size = 32, verbose = 1)  # training the model

# mse = model.evaluate(feature_test_scaled, target_test)
# print("mean squared error", mse)


# target_prediction = model.predict(feature_test_scaled)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# mse = mean_squared_error(target_test, target_prediction)

# feature_we_want = len(target)  # what we are looking for
# regression = 1  # there is only one predictor
# residual = feature_we_want - 2
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
# unexplained_variance = mse * feature_we_want

# F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
# p_value = 1 - f.cdf(F_value, regression, residual)
# rse = np.sqrt(mse)

# print(f"R-squared {r2}")
# print(f"mean absolute error {mae}")
# print(f"mean squared error {mse}")
# print(f"regression: {regression:.4f}")
# print(f"residual: {residual:.4f}")
# print(f"p-value: {p_value:.4f}")  # p-value for the report
# print(f"standard error: {rse:.2f}")
# print(f"f-statistic: {F_value:.2f}")
# # the mse difference between the predicted and actual fuel burn totals is around 4.97; if it were lower it would be better

# # mse is 0 now, this is a good model!

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from scipy.stats import f

# Load the dataset
dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')
dataset.dropna(inplace=True)

# Features and target
# Note: 'fuel_burn_total' (the target itself) is included among the features here, which leaks the answer to the model
features = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]
target = dataset['fuel_burn_total']

# Encoding the 'model' column
encoder = OneHotEncoder(sparse_output=False)
model_encoded = pd.DataFrame(encoder.fit_transform(features[['model']]))
model_encoded.columns = encoder.get_feature_names_out(['model'])

# Drop the original 'model' column and add the encoded data
features = features.drop('model', axis=1)
features = pd.concat([features.reset_index(drop=True), model_encoded.reset_index(drop=True)], axis=1)

# Train-test split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)

# Feature scaling
scaler = StandardScaler()
feature_train_scaled = scaler.fit_transform(feature_train)
feature_test_scaled = scaler.transform(feature_test)

# Neural network model (an Input layer avoids the Keras warning about passing input_shape to Dense)
model = Sequential([
    Input(shape=(feature_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1)
])

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(feature_train_scaled, target_train, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
mse = model.evaluate(feature_test_scaled, target_test)
print("Mean Squared Error:", mse)

# Predictions and performance metrics
target_prediction = model.predict(feature_test_scaled)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
mse = mean_squared_error(target_test, target_prediction)

# Calculate F-statistic and p-value (for reporting purposes)
n_samples = len(target)
n_predictors = feature_train_scaled.shape[1]
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)
rse = np.sqrt(mse)

# Print the results
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"p-value: {p_value:.4f}")
print(f"Root Mean Squared Error: {rse:.2f}")
print(f"F-statistic: {F_value:.2f}")
models/neural_network/test.ipynb ADDED
@@ -0,0 +1,264 @@

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense\n",
    "from scipy.stats import f\n",
    "\n",
    "# Load the dataset\n",
    "dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')\n",
    "dataset.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features and target\n",
    "features = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]\n",
    "target = dataset['fuel_burn_total']\n",
    "\n",
    "# Encoding the 'model' column\n",
    "encoder = OneHotEncoder(sparse_output=False)\n",
    "model_encoded = pd.DataFrame(encoder.fit_transform(features[['model']]))\n",
    "model_encoded.columns = encoder.get_feature_names_out(['model'])\n",
    "\n",
    "# Drop the original 'model' column and add the encoded data\n",
    "features = features.drop('model', axis=1)\n",
    "features = pd.concat([features.reset_index(drop=True), model_encoded.reset_index(drop=True)], axis=1)\n",
    "\n",
    "# Train-test split\n",
    "feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)\n",
    "\n",
    "# Feature scaling\n",
    "scaler = StandardScaler()\n",
    "feature_train_scaled = scaler.fit_transform(feature_train)\n",
    "feature_test_scaled = scaler.transform(feature_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/50\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/envs/Intenv/lib/python3.9/site-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
      "  super().__init__(activity_regularizer=activity_regularizer, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 6ms/step - loss: 140.5811\n",
      "Epoch 2/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 1.9729\n",
      "Epoch 3/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.7662\n",
      "Epoch 4/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.8330\n",
      "Epoch 5/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.7197\n",
      "Epoch 6/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.7294\n",
      "Epoch 7/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.6337\n",
      "Epoch 8/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.4558\n",
      "Epoch 9/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.3461\n",
      "Epoch 10/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.4073\n",
      "Epoch 11/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3993\n",
      "Epoch 12/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3657\n",
      "Epoch 13/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3334\n",
      "Epoch 14/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3895\n",
      "Epoch 15/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.4462\n",
      "Epoch 16/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2150\n",
      "Epoch 17/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3340\n",
      "Epoch 18/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2634\n",
      "Epoch 19/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.2737\n",
      "Epoch 20/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2614\n",
      "Epoch 21/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2445\n",
      "Epoch 22/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2159\n",
      "Epoch 23/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.4048\n",
      "Epoch 24/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2998\n",
      "Epoch 25/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2747\n",
      "Epoch 26/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2207\n",
      "Epoch 27/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1944\n",
      "Epoch 28/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3801\n",
      "Epoch 29/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2268\n",
      "Epoch 30/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.2105\n",
      "Epoch 31/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1308\n",
      "Epoch 32/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.1518\n",
      "Epoch 33/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.1473\n",
      "Epoch 34/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2194\n",
      "Epoch 35/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.1172\n",
      "Epoch 36/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1910\n",
      "Epoch 37/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1921\n",
      "Epoch 38/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2753\n",
      "Epoch 39/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2847\n",
      "Epoch 40/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1538\n",
      "Epoch 41/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1008\n",
      "Epoch 42/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1592\n",
      "Epoch 43/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.0971\n",
      "Epoch 44/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1211\n",
      "Epoch 45/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1177\n",
      "Epoch 46/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.0955\n",
      "Epoch 47/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.0695\n",
      "Epoch 48/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2184\n",
      "Epoch 49/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1073\n",
      "Epoch 50/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1462\n",
      "\u001b[1m146/146\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 2ms/step - loss: 0.0717\n",
      "Mean Squared Error: 0.16058479249477386\n",
      "\u001b[1m146/146\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 1ms/step\n"
     ]
    }
   ],
   "source": [
    "# Neural network model\n",
    "model = Sequential([\n",
    "    Dense(64, activation='relu', input_shape=(feature_train_scaled.shape[1],)),\n",
    "    Dense(64, activation='relu'),\n",
    "    Dense(1)\n",
    "])\n",
    "\n",
    "# Compile and train the model\n",
    "model.compile(optimizer='adam', loss='mean_squared_error')\n",
    "model.fit(feature_train_scaled, target_train, epochs=50, batch_size=32, verbose=1)\n",
    "\n",
    "# Evaluate the model\n",
    "mse = model.evaluate(feature_test_scaled, target_test)\n",
    "print(\"Mean Squared Error:\", mse)\n",
    "\n",
    "# Predictions and performance metrics\n",
    "target_prediction = model.predict(feature_test_scaled)\n",
    "r2 = r2_score(target_test, target_prediction)\n",
    "mae = mean_absolute_error(target_test, target_prediction)\n",
    "mse = mean_squared_error(target_test, target_prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared: 0.9780861666108605\n",
      "Mean Absolute Error: 0.7006260730692777\n",
      "Mean Squared Error: 2.554603752569432\n",
      "p-value: 0.0000\n",
      "Root Squared Error: 1.60\n",
      "F-statistic: 24052.88\n"
     ]
    }
   ],
   "source": [
    "# Calculate F-statistic and p-value \n",
    "n_samples = len(target)\n",
    "n_predictors = feature_train_scaled.shape[1]\n",
    "residual = n_samples - n_predictors - 1\n",
    "explained_variance = r2 * np.sum((target - np.mean(target))**2)\n",
    "unexplained_variance = mse * n_samples\n",
    "\n",
    "F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)\n",
    "p_value = 1 - f.cdf(F_value, n_predictors, residual)\n",
    "rse = np.sqrt(mse)\n",
    "\n",
    "# Print the results\n",
    "print(f\"R-squared: {r2}\")\n",
    "print(f\"Mean Absolute Error: {mae}\")\n",
    "print(f\"Mean Squared Error: {mse}\")\n",
    "print(f\"p-value: {p_value:.4f}\")\n",
    "print(f\"Root Squared Error: {rse:.2f}\")\n",
    "print(f\"F-statistic: {F_value:.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Intenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
models/xgboost/gradient_boosting_regressor.py ADDED
@@ -0,0 +1,240 @@

# # -*- coding: utf-8 -*-
# """gradient_boosting_regressor.ipynb

# Automatically generated by Colab.

# Original file is located at
# https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
# """

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.preprocessing import StandardScaler, OneHotEncoder

# # the dataset I am using is from RapidApi
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers = headers)
# if response.status_code == 200:
#     data = response.json()
#     print(data)
# else:
#     print({response.status_code}, {response.text})
# # Note: climbData and descendData are not used since there is only one key entry for both features

# # Gradient Boosting Regressor
# # Here I'm using the same .json dataset with a new model, the Gradient Boosting Regressor

# data = response.json()
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)  # extracting features for the model

# #print(df.columns)

# # numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors = 'coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors = 'coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors = 'coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors = 'coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors = 'coerce')

# df.dropna(inplace = True)
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]  # you can play with this and add more features; I kept it simple with what I know is important
# target = df['fuel']

# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.2, random_state = 42)  # split into train and test

# model = GradientBoostingRegressor(n_estimators = 100, learning_rate = 25, max_depth = 5, random_state = 42)  # can play with the hyperparameters and observe the model metrics
# model.fit(features_train, target_train)  # fitting the model
# target_prediction = model.predict(features_test)  # predictions

# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()


# feature_we_want = len(target)  # what we are looking for
# regression = 1  # there is only one predictor
# residual = feature_we_want - 2
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
# unexplained_variance = mse * feature_we_want

# F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
# p_value = 1 - f.cdf(F_value, regression, residual)
# rse = np.sqrt(mse)

# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]  # you can change the values; order is ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']
# predicted_fuel_future = model.predict([future_distance_nm])
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # can change to "dist_km" to see the average in km

# print(f"mean squared error: {mse}")  # checking the model performance
# print(f"R-squared: {r2}")
# print(f"mean absolute error: {mae}")
# print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the XGBoost model")
# print(f"regression: {regression:.4f}")
# print(f"residual: {residual:.4f}")
# print(f"p-value: {p_value:.4f}")  # p-value for the report
# print(f"standard error: {rse:.2f}")
# print(f"f-statistic: {F_value:.2f}")
# print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0]:.2f} kg")

# # seems like the mse is very high, but this changes if we add or remove features
# # the R-squared and mae have the same numbers as the linear regression model, so that's good

# # added more features; I am now playing with the hyperparameters, and the metrics go up and down based on them

# # mse really high, this is a bad model; R-squared is a negative number

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.preprocessing import StandardScaler

# # Load data from API
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers=headers)

# if response.status_code == 200:
#     data = response.json()
# else:
#     print(f"Error {response.status_code}: {response.text}")

# # Extract features
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)

# # Convert relevant columns to numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

# df.dropna(inplace=True)

# # Define features and target
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
# target = df['fuel']

# # Split the data
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Gradient Boosting Regressor
# model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
# model.fit(features_train, target_train)
# target_prediction = model.predict(features_test)

# # Evaluate model performance
# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# # Future predictions
# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
# predicted_fuel_future = model.predict([future_distance_nm])

# # Print the results
# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")
# print(f"Mean Absolute Error: {mae}")
# print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
# print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")

# # Comment on performance
# if mse > 1000:  # Threshold can be adjusted
#     print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
# if r2 < 0:
#     print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")

import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import f  # importing the F-distribution

# Load data from API
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    data = response.json()
else:
    print(f"Error {response.status_code}: {response.text}")

# Extract features
features = [feature['properties'] for feature in data['features']]
df = pd.DataFrame(features)

# Convert relevant columns to numeric
df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

df.dropna(inplace=True)

# Define features and target
# Note: 'fuel' appears both as a predictor and as the target, and 'CO2' is derived
# from fuel burned, so the model effectively sees the answer; a fix is sketched after this file
features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
target = df['fuel']

# Split the data
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(features_train, target_train)
target_prediction = model.predict(features_test)

# Evaluate model performance
mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# Future predictions (values for ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel'])
future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
predicted_fuel_future = model.predict([future_distance_nm])

# Calculate F-statistic and p-value (if necessary)
n_samples = len(target)
n_predictors = features_train.shape[1]
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)

# Print the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
print(f"Predicted Fuel Needed for a Flight with Features {future_distance_nm}: {predicted_fuel_future[0]:.2f} kg")
print(f"p-value: {p_value:.4f}")
print(f"F-statistic: {F_value:.2f}")

# Comment on performance
if mse > 1000:  # Threshold can be adjusted
    print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
if r2 < 0:
    print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
models/xgboost/inference.py ADDED
@@ -0,0 +1,58 @@

import pandas as pd
import joblib


def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df.drop(df.columns[0], axis=1)  # Drop the Unnamed: 0 column (index column)
    return df


def load_model(model_path):
    return joblib.load(model_path)


def evaluate_model(df, model, selected_features, batch_size=100):
    total_accuracy = 0
    num_rows = len(df)

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        batch_df = df.iloc[start:end]

        # Read the target column and select the model features (avoids mutating the slice with pop)
        fuel_burn_total = batch_df['fuel_burn_total'].values
        batch_X = batch_df[selected_features]

        predictions = model.predict(batch_X)

        # Calculate accuracy for the current batch
        accuracy = 1 - abs(fuel_burn_total - predictions) / fuel_burn_total
        batch_accuracy = accuracy.mean()
        total_accuracy += batch_accuracy * len(batch_df)

        print(f'Processed rows {start + 1} to {end} out of {num_rows} rows')

    average_accuracy = total_accuracy / num_rows
    return average_accuracy


def main():
    data_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/test.csv'
    model_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/xgboost_model.joblib'

    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
                         'CAT', 'dist']

    # Load data and model
    df = load_data(data_file_path)
    model = load_model(model_file_path)

    # Evaluate the model
    average_accuracy = evaluate_model(df, model, selected_features)

    # Print the average accuracy
    print(f'Average Accuracy: {average_accuracy:.2%}')


if __name__ == "__main__":
    main()
models/xgboost/model.py ADDED
@@ -0,0 +1,125 @@

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib


def load_data(file_path):
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    data = data.drop(data.columns[0], axis=1)  # Drop the Unnamed: 0 column
    return data


def preprocess_data(data, features_to_use, categorical_features, numerical_features):
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the datasets
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor


def train_model(X_train, y_train, preprocessor, best_params):
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training datasets
    final_pipeline.fit(X_train, y_train)

    return final_pipeline


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse


def save_model(model, model_path):
    joblib.dump(model, model_path)


def main():
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
                                                                     numerical_features)

    # best_params is a dictionary that holds the optimal hyperparameters for the XGBoost model.
    # These hyperparameters were determined through a process of hyperparameter tuning.
    #
    # - 'n_estimators': determines the number of boosting rounds or trees to build.
    # - 'max_depth': maximum tree depth for base learners.
    # - 'learning_rate': controls the shrinkage of each tree's contribution. Smaller values require more iterations but
    #   can improve generalization. Typical values range from 0.01 to 0.1.
    # - 'subsample': controls the fraction of observations used for each tree. A smaller subsample value results in
    #   smaller and less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    # Save the final model
    save_model(model, model_file_path)


if __name__ == "__main__":
    main()
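
The comment above says best_params came from a hyperparameter-tuning run. Below is a minimal sketch of how such a search could be reproduced over the same pipeline, assuming the functions defined in this file; the candidate values are illustrative, not the search space actually used.

from sklearn.model_selection import RandomizedSearchCV

def search_params(X_train, y_train, preprocessor):
    # Same pipeline shape as train_model, but with the hyperparameters left open
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
    ])
    param_distributions = {
        'model__n_estimators': [200, 400, 800],      # illustrative candidates only
        'model__max_depth': [10, 20, 30],
        'model__learning_rate': [0.03, 0.08, 0.1],
        'model__subsample': [0.7, 0.9, 1.0],
    }
    search = RandomizedSearchCV(pipeline, param_distributions, n_iter=10,
                                scoring='neg_root_mean_squared_error', cv=3,
                                random_state=42)
    search.fit(X_train, y_train)
    # Strip the 'model__' prefix so the result plugs straight into train_model()
    return {k.split('__', 1)[1]: v for k, v in search.best_params_.items()}
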