poudel committed on
Commit
f637442
1 Parent(s): 6b84851

Upload 11 files

models/.DS_Store ADDED
Binary file (6.15 kB).
 
models/decision_tree_regression/decision_tree_regressor.py ADDED
@@ -0,0 +1,86 @@

# -*- coding: utf-8 -*-
"""decision_tree_regressor.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
"""

import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import f  # needed below for the F-distribution p-value

# the dataset I am using is from RapidApi
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error {response.status_code}: {response.text}")
# Note: climbData and descendData are not used since there is only one key entry for both features

# Decision Tree Regressor

features = []  # extracting the features
for flight in data["features"]:
    properties = flight["properties"]
    geometry = flight["geometry"]["coordinates"]  # extracted but currently unused
    distance_km = float(properties["dist_km"])
    cruise_time = int(properties["cruiseTime"])
    fuel = float(properties["fuel"])
    CO2 = float(properties["CO2"])
    features.append([distance_km, cruise_time, CO2, fuel])

df = pd.DataFrame(features, columns=["distance_km", "cruise_time", "CO2", "fuel"])  # converting to a data frame
feature = df.drop("fuel", axis=1)
target = df["fuel"]

feature_train, feature_test, target_train, target_test = train_test_split(feature, target, test_size=0.1, random_state=42)
# split into train and test

regression_tree = DecisionTreeRegressor(max_depth=100, min_samples_leaf=50, random_state=42)  # can also change the hyperparameters
regression_tree.fit(feature_train, target_train)
target_prediction = regression_tree.predict(feature_test)  # making the predictions

mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)

n_samples = len(target)
n_predictors = feature.shape[1]  # three predictors: distance_km, cruise_time and CO2
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)  # calculating the F statistic for the report
p_value = 1 - f.cdf(F_value, n_predictors, residual)
rse = np.sqrt(mse)

print(f"mean squared error {mse}")
print(f"R-squared {r2}")
print(f"mean absolute error {mae}")
print(f"regression df: {n_predictors}")
print(f"residual df: {residual}")
print(f"p-value: {p_value:.4f}")  # p-value for the report
print(f"standard error: {rse:.2f}")
print(f"f-statistic: {F_value:.2f}")

# Very high mse and mae

# Played with the hyperparameters; I need to learn a bit more about some of them (a tuning sketch follows this file)

# metrics are still high, so this is a bad model
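
The closing comments mention tuning max_depth and min_samples_leaf by hand. Below is a minimal sketch of automating that with scikit-learn's GridSearchCV, assuming the `feature` and `target` frames built above; the candidate values are illustrative, not settings used in this commit.

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Illustrative candidate values only; the API returns few route segments, so keep cv small
param_grid = {
    "max_depth": [2, 5, 10, None],
    "min_samples_leaf": [1, 5, 20, 50],
}
search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
)
search.fit(feature, target)
print(search.best_params_, -search.best_score_)  # best settings and their cross-validated MSE
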
models/linear_regression/linear_regression.py ADDED
@@ -0,0 +1,140 @@

# -*- coding: utf-8 -*-
"""linear_regression.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
"""

import pandas as pd
import requests
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import f  # needed below for the F-distribution p-value

# the dataset I am using is from RapidApi
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error {response.status_code}: {response.text}")
# Note: climbData and descendData are not used since there is only one key entry for both features

# Linear regression model
# Here I am using two columns, "fuel" and "dist_nm"
fuel = []
distance = []

for segment in data['features']:
    fuel.append(float(segment['properties']['fuel']))
    distance.append(float(segment['properties']['dist_nm']))

# converting to numpy arrays
fuel = np.array(fuel).reshape(-1, 1)
distance = np.array(distance).reshape(-1, 1)

model = LinearRegression()  # building the model
model.fit(distance, fuel)  # fitting the model

predicted_fuel = model.predict(distance)  # predicted_fuel holds the predicted values

# looking at the model metrics
mse = mean_squared_error(fuel, predicted_fuel)
r2 = r2_score(fuel, predicted_fuel)
future_distance_nm = 30.90  # you can change the value of future_distance_nm
predicted_fuel_future = model.predict([[future_distance_nm]])  # predicting fuel for a new distance

n_samples = len(fuel)
regression = 1  # there is only one predictor
residual = n_samples - 2
explained_variance = r2 * np.sum((fuel - np.mean(fuel))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
p_value = 1 - f.cdf(F_value, regression, residual)
rse = np.sqrt(mse)

mean_distance = np.mean(distance)
se_coefficient = rse / np.sqrt(np.sum((distance - mean_distance)**2))

print(f"regression df: {regression}")
print(f"residual df: {residual}")
print(f"p-value: {p_value:.4f}")  # p-value for the report
print(f"r^2 score: {r2:.2f}")
print(f"slope: {model.coef_[0][0]:.2f}")  # kg of fuel per nautical mile
print(f"mean squared error: {mse:.2f}")
print(f"f-statistic: {F_value:.2f}")
print(f"standard error: {rse:.2f}")
print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0][0]:.2f} kg")

# a more in-depth version of the linear regression model, since it is giving good results
# Here I selected more of the important features that contribute to the total fuel needed for the flight

features = [feature['properties'] for feature in data['features']]  # taking the important features
df = pd.DataFrame(features)
numeric_cols = ['dist_km', 'cruiseTime', 'fuel', 'CO2', 'dist_nm']  # can add or remove features
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

df.rename(columns={'fuel': 'cruiseFuel'}, inplace=True)
features = df[['dist_km', 'cruiseTime', 'CO2', 'dist_nm']]  # can add or remove features
target = df['cruiseFuel']
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)  # split into train and test

model = LinearRegression()  # model
model.fit(features_train, target_train)  # fitting the model
target_prediction = model.predict(features_test)  # making predictions

mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
future_distance_nm = [30.90, 40, 1894.34, 23.9]  # values for ['dist_km', 'cruiseTime', 'CO2', 'dist_nm']; you can change them
predicted_fuel_future = model.predict([future_distance_nm])
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # change to "dist_km" to see the average per km

n_samples = len(target)
n_predictors = features.shape[1]  # four predictors in this model
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)  # F statistic for the report
p_value = 1 - f.cdf(F_value, n_predictors, residual)
rse = np.sqrt(mse)

print(f"mean squared error {mse:.2f}")
print(f"R-squared {r2:.2f}")
print(f"mean absolute error {mae:.2f}")
print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the LR model")
print(f"regression df: {n_predictors}")
print(f"residual df: {residual}")
print(f"p-value: {p_value:.4f}")  # p-value for the report
print(f"standard error: {rse:.2f}")
print(f"f-statistic: {F_value:.2f}")
print(f"predicted fuel needed for a flight with features {future_distance_nm}: {predicted_fuel_future[0]:.2f} kg")
# mse is 26.97 on the single-feature model, which is low; this means the model is performing well
# note: recomputing the predictions from features_test gives the same mse as reusing target_prediction
# R-squared is close to 1, which means the model is a good fit
# mae is 3.5, which explains why some numbers differ a bit, but the predicted values are close to the actual ones

# with the extra features the mse went down to 0.0, which looks good, but I am a bit sceptical (see the leakage check after this file)
# R-squared went up to 1, so the model looks like a perfect fit
# the mae went down to 0

# these results are for the model above
mean_cruise_fuel = df['cruiseFuel'].mean()  # mean of the cruiseFuel values
mse_to_mean_ratio = mse / mean_cruise_fuel  # ratio of mse to the mean cruiseFuel
print(mean_cruise_fuel, mse_to_mean_ratio)

# the ratio of 0.0162% means the mse is small compared to mean_cruise_fuel; this is good, and again the predictions are
# close to the actual values

# the numbers went down even more!
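
The jump to mse 0.0 and R-squared 1.0 is most likely target leakage rather than a good fit: CO2 in this dataset is derived from fuel burned (roughly 3.16 kg of CO2 per kg of jet fuel), so the 'CO2' column effectively hands the model the answer. A minimal check and refit, assuming the `df` built above:

# Correlations with the target; 'CO2' should come out essentially perfect
print(df[['dist_km', 'cruiseTime', 'CO2', 'dist_nm']].corrwith(df['cruiseFuel']))

# Refit without the leaky column; the metrics should become realistic again
clean_features = df[['dist_km', 'cruiseTime', 'dist_nm']]
Xtr, Xte, ytr, yte = train_test_split(clean_features, df['cruiseFuel'], test_size=0.1, random_state=42)
clean_model = LinearRegression().fit(Xtr, ytr)
print(f"R-squared without CO2: {r2_score(yte, clean_model.predict(Xte)):.2f}")
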
models/neural_network/__pycache__/inference.cpython-39.pyc ADDED
Binary file (2.53 kB).
 
models/neural_network/inference.py ADDED
@@ -0,0 +1,76 @@

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf


def load_data(path):
    df = pd.read_csv(path)
    df = df.drop(df.columns[0], axis=1)  # drop the Unnamed: 0 column
    return df


def load_model_and_preprocessor(model_path, preprocessor_path):
    loaded_model = tf.keras.models.load_model(model_path)
    preprocessor = joblib.load(preprocessor_path)
    return loaded_model, preprocessor


def select_features(df, selected_features):
    X_test = df[selected_features]
    y_test = df['fuel_burn_total']
    return X_test, y_test


def preprocess_data(preprocessor, X_test):
    X_test_processed = preprocessor.transform(X_test)
    return X_test_processed


def predict_in_batches(loaded_model, X_test_processed, y_test, batch_size):
    num_batches = X_test_processed.shape[0] // batch_size + int(X_test_processed.shape[0] % batch_size != 0)
    total_accuracy = 0

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, X_test_processed.shape[0])
        batch_X = X_test_processed[start_index:end_index]
        batch_y = y_test.iloc[start_index:end_index]

        # Make predictions with the loaded final model
        batch_predictions = loaded_model.predict(batch_X)

        # Accuracy for the current batch: 1 minus the mean absolute percentage error
        batch_accuracy = 1 - np.mean(np.abs(batch_y.values - batch_predictions[:, 0]) / batch_y.values)
        total_accuracy += batch_accuracy * (end_index - start_index)

        print(f'Batch {batch_num + 1}/{num_batches} - Accuracy: {batch_accuracy:.2%}')

    average_accuracy = total_accuracy / X_test_processed.shape[0]
    print(f'Average Accuracy: {average_accuracy:.2%}')


def main():
    df = load_data('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')

    loaded_model, preprocessor = load_model_and_preprocessor(
        '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/nn_model.keras',
        '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/nn_preprocessor.pkl')

    selected_features = [
        'Origin_Airport', 'Destination_Airport', 'Operating_Airline', 'model', '_Manufacturer',
        'seats', 'distance', '_Operating_Airline_ASK_(Millions)', 'FLIGHT_ID', 'FFLOW_KGM',
        'J/T', 'CAT', 'dist', 'mean_taxi_in'
    ]
    # Select only the relevant features
    X_test, y_test = select_features(df, selected_features)

    X_test_processed = preprocess_data(preprocessor, X_test)

    predict_in_batches(loaded_model, X_test_processed, y_test, batch_size=32)


if __name__ == "__main__":
    import os

    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    main()
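
As a cross-check of predict_in_batches: because each batch's accuracy is weighted by its size, the batched average equals one vectorized computation over the whole test set. A minimal sketch, assuming the same `loaded_model`, `X_test_processed` and `y_test` as above:

def overall_accuracy(loaded_model, X_test_processed, y_test, batch_size=32):
    # Keras batches internally, so a single predict call is enough
    predictions = loaded_model.predict(X_test_processed, batch_size=batch_size)[:, 0]
    # 1 minus the mean absolute percentage error, as in predict_in_batches
    return 1 - np.mean(np.abs(y_test.values - predictions) / y_test.values)
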
models/neural_network/model.py ADDED
@@ -0,0 +1,118 @@

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


def load_data(file_path):
    return pd.read_csv(file_path)


def preprocess_data(data, selected_features, categorical_features, numerical_features):
    # Define preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the datasets
    X = data[selected_features]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocess the datasets
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test, preprocessor


def build_model(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model


def train_model(model, X_train, y_train, epochs=50, batch_size=32, patience=10, validation_split=0.2):
    """
    Trains the provided model using the training data.

    Parameters:
    model (tensorflow.keras.Model): The model to be trained.
    X_train (numpy.ndarray): The training data.
    y_train (numpy.ndarray): The target values for the training data.
    epochs (int, optional): The number of epochs to train the model. Default is 50.
    batch_size (int, optional): The number of samples per gradient update. Default is 32.
    patience (int, optional): Number of epochs with no improvement after which training will be stopped. Default is 10.
    validation_split (float, optional): Fraction of the training data to be used as validation data. Default is 0.2.

    Returns:
    model (tensorflow.keras.Model): The trained model.
    history (tensorflow.python.keras.callbacks.History): A record of training loss values and metrics values at successive epochs.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(X_train, y_train, validation_split=validation_split, epochs=epochs, callbacks=[early_stopping],
                        batch_size=batch_size)

    return model, history


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = np.mean(np.abs(y_test - y_pred.flatten()))
    rmse = np.sqrt(np.mean((y_test - y_pred.flatten()) ** 2))
    return mae, rmse


def save_model(model, preprocessor, model_path, preprocessor_path):
    model.save(model_path)
    joblib.dump(preprocessor, preprocessor_path)


def main():
    data = load_data('../../datasets/preprocessed_data.csv')
    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
                         'CAT', 'dist']

    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'J/T', 'CAT']
    numerical_features = ['seats', 'distance', 'dist']

    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, selected_features, categorical_features,
                                                                     numerical_features)

    model = build_model(X_train.shape[1])

    model, history = train_model(model, X_train, y_train)

    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    save_model(model, preprocessor, '../../saved_models/nn_model.keras', '../../saved_models/nn_preprocessor.pkl')


if __name__ == "__main__":
    import os

    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    main()
models/neural_network/neural_network.py ADDED
@@ -0,0 +1,161 @@

# # -*- coding: utf-8 -*-
# """neural_network.ipynb

# Automatically generated by Colab.

# Original file is located at
# https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
# """

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from scipy.stats import f

# # Neural Network model
# # Note: here I am using a new dataset which Abdulelah shared with me.
# # data filename: "preprocessed_data.csv"

# dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')
# # using the dataset Abdulelah gave me

# dataset.dropna(inplace = True)
# dataset.head()


# feature = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]
# target = dataset['fuel_burn_total']

# feature = feature.copy()
# feature.drop('model', axis=1, inplace=True)

# # doing the encoding
# encoder = OneHotEncoder(sparse_output = False)
# feature_encoded = pd.DataFrame(encoder.fit_transform(feature[['model']]))
# feature_encoded.columns = encoder.get_feature_names_out(['model'])
# feature.drop('model', axis = 1, inplace = True)
# feature = pd.concat([feature.reset_index(drop = True), feature_encoded.reset_index(drop = True)], axis = 1)


# feature_train, feature_test, target_train, target_test = train_test_split(feature, target, test_size = 0.1, random_state = 42)  # split into train and test
# scaler = StandardScaler()
# feature_train_scaled = scaler.fit_transform(feature_train)
# feature_test_scaled = scaler.transform(feature_test)

# # building the model
# model = Sequential([
#     Dense(64, activation = 'relu', input_shape = (feature_train_scaled.shape[1],)),
#     Dense(64, activation = 'relu'),
#     Dense(1)])  # can change the Dense layers
# model.compile(optimizer = 'adam', loss = 'mean_squared_error')  # compiling the model
# model.fit(feature_train_scaled, target_train, epochs = 50, batch_size = 32, verbose = 1)  # training the model

# mse = model.evaluate(feature_test_scaled, target_test)
# print("mean squared error", mse)


# target_prediction = model.predict(feature_test_scaled)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# mse = mean_squared_error(target_test, target_prediction)

# feature_we_want = len(target)  # what we are looking for
# regression = 1  # there is only one predictor
# residual = feature_we_want - 2
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
# unexplained_variance = mse * feature_we_want

# F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
# p_value = 1 - f.cdf(F_value, regression, residual)
# rse = np.sqrt(mse)

# print(f"R-squared {r2}")
# print(f"mean absolute error {mae}")
# print(f"mean squared error {mse}")
# print(f"regression: {regression:.4f}")
# print(f"residual: {residual:.4f}")
# print(f"p-value: {p_value:.4f}")  # p-value for the report
# print(f"standard error: {rse:.2f}")
# print(f"f-statistic: {F_value:.2f}")
# # the mse difference between the predicted and actual fuel burn totals is around 4.97; if it were lower it would be better

# # mse is 0 now, this is a good model!

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from scipy.stats import f

# Load the dataset
dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')
dataset.dropna(inplace=True)

# Features and target
# Note: 'fuel_burn_total' (the target itself) is included among the features here, which leaks the answer to the model
features = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]
target = dataset['fuel_burn_total']

# Encoding the 'model' column
encoder = OneHotEncoder(sparse_output=False)
model_encoded = pd.DataFrame(encoder.fit_transform(features[['model']]))
model_encoded.columns = encoder.get_feature_names_out(['model'])

# Drop the original 'model' column and add the encoded data
features = features.drop('model', axis=1)
features = pd.concat([features.reset_index(drop=True), model_encoded.reset_index(drop=True)], axis=1)

# Train-test split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)

# Feature scaling
scaler = StandardScaler()
feature_train_scaled = scaler.fit_transform(feature_train)
feature_test_scaled = scaler.transform(feature_test)

# Neural network model (an Input layer avoids the Keras warning about passing input_shape to Dense)
model = Sequential([
    Input(shape=(feature_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1)
])

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(feature_train_scaled, target_train, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
mse = model.evaluate(feature_test_scaled, target_test)
print("Mean Squared Error:", mse)

# Predictions and performance metrics
target_prediction = model.predict(feature_test_scaled)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
mse = mean_squared_error(target_test, target_prediction)

# Calculate F-statistic and p-value (for reporting purposes)
n_samples = len(target)
n_predictors = feature_train_scaled.shape[1]
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)
rse = np.sqrt(mse)

# Print the results
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"p-value: {p_value:.4f}")
print(f"Root Mean Squared Error: {rse:.2f}")
print(f"F-statistic: {F_value:.2f}")
models/neural_network/test.ipynb ADDED
@@ -0,0 +1,264 @@

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense\n",
    "from scipy.stats import f\n",
    "\n",
    "# Load the dataset\n",
    "dataset = pd.read_csv('/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/preprocessed_data.csv')\n",
    "dataset.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features and target\n",
    "features = dataset[['distance', 'model', 'seats', 'fuel_burn', 'fuel_burn_total']]\n",
    "target = dataset['fuel_burn_total']\n",
    "\n",
    "# Encoding the 'model' column\n",
    "encoder = OneHotEncoder(sparse_output=False)\n",
    "model_encoded = pd.DataFrame(encoder.fit_transform(features[['model']]))\n",
    "model_encoded.columns = encoder.get_feature_names_out(['model'])\n",
    "\n",
    "# Drop the original 'model' column and add the encoded data\n",
    "features = features.drop('model', axis=1)\n",
    "features = pd.concat([features.reset_index(drop=True), model_encoded.reset_index(drop=True)], axis=1)\n",
    "\n",
    "# Train-test split\n",
    "feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=42)\n",
    "\n",
    "# Feature scaling\n",
    "scaler = StandardScaler()\n",
    "feature_train_scaled = scaler.fit_transform(feature_train)\n",
    "feature_test_scaled = scaler.transform(feature_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/50\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/envs/Intenv/lib/python3.9/site-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
      "  super().__init__(activity_regularizer=activity_regularizer, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 6ms/step - loss: 140.5811\n",
      "Epoch 2/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 1.9729\n",
      "Epoch 3/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.7662\n",
      "Epoch 4/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.8330\n",
      "Epoch 5/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.7197\n",
      "Epoch 6/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.7294\n",
      "Epoch 7/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.6337\n",
      "Epoch 8/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.4558\n",
      "Epoch 9/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.3461\n",
      "Epoch 10/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.4073\n",
      "Epoch 11/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3993\n",
      "Epoch 12/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3657\n",
      "Epoch 13/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3334\n",
      "Epoch 14/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3895\n",
      "Epoch 15/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.4462\n",
      "Epoch 16/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2150\n",
      "Epoch 17/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3340\n",
      "Epoch 18/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2634\n",
      "Epoch 19/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 7ms/step - loss: 0.2737\n",
      "Epoch 20/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2614\n",
      "Epoch 21/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2445\n",
      "Epoch 22/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2159\n",
      "Epoch 23/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.4048\n",
      "Epoch 24/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2998\n",
      "Epoch 25/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2747\n",
      "Epoch 26/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2207\n",
      "Epoch 27/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1944\n",
      "Epoch 28/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.3801\n",
      "Epoch 29/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2268\n",
      "Epoch 30/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.2105\n",
      "Epoch 31/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1308\n",
      "Epoch 32/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.1518\n",
      "Epoch 33/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.1473\n",
      "Epoch 34/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2194\n",
      "Epoch 35/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 6ms/step - loss: 0.1172\n",
      "Epoch 36/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1910\n",
      "Epoch 37/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1921\n",
      "Epoch 38/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2753\n",
      "Epoch 39/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2847\n",
      "Epoch 40/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1538\n",
      "Epoch 41/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1008\n",
      "Epoch 42/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1592\n",
      "Epoch 43/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.0971\n",
      "Epoch 44/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1211\n",
      "Epoch 45/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1177\n",
      "Epoch 46/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.0955\n",
      "Epoch 47/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.0695\n",
      "Epoch 48/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.2184\n",
      "Epoch 49/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1073\n",
      "Epoch 50/50\n",
      "\u001b[1m1314/1314\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 6ms/step - loss: 0.1462\n",
      "\u001b[1m146/146\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 2ms/step - loss: 0.0717\n",
      "Mean Squared Error: 0.16058479249477386\n",
      "\u001b[1m146/146\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 1ms/step\n"
     ]
    }
   ],
   "source": [
    "# Neural network model\n",
    "model = Sequential([\n",
    "    Dense(64, activation='relu', input_shape=(feature_train_scaled.shape[1],)),\n",
    "    Dense(64, activation='relu'),\n",
    "    Dense(1)\n",
    "])\n",
    "\n",
    "# Compile and train the model\n",
    "model.compile(optimizer='adam', loss='mean_squared_error')\n",
    "model.fit(feature_train_scaled, target_train, epochs=50, batch_size=32, verbose=1)\n",
    "\n",
    "# Evaluate the model\n",
    "mse = model.evaluate(feature_test_scaled, target_test)\n",
    "print(\"Mean Squared Error:\", mse)\n",
    "\n",
    "# Predictions and performance metrics\n",
    "target_prediction = model.predict(feature_test_scaled)\n",
    "r2 = r2_score(target_test, target_prediction)\n",
    "mae = mean_absolute_error(target_test, target_prediction)\n",
    "mse = mean_squared_error(target_test, target_prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared: 0.9780861666108605\n",
      "Mean Absolute Error: 0.7006260730692777\n",
      "Mean Squared Error: 2.554603752569432\n",
      "p-value: 0.0000\n",
      "Root Squared Error: 1.60\n",
      "F-statistic: 24052.88\n"
     ]
    }
   ],
   "source": [
    "# Calculate F-statistic and p-value \n",
    "n_samples = len(target)\n",
    "n_predictors = feature_train_scaled.shape[1]\n",
    "residual = n_samples - n_predictors - 1\n",
    "explained_variance = r2 * np.sum((target - np.mean(target))**2)\n",
    "unexplained_variance = mse * n_samples\n",
    "\n",
    "F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)\n",
    "p_value = 1 - f.cdf(F_value, n_predictors, residual)\n",
    "rse = np.sqrt(mse)\n",
    "\n",
    "# Print the results\n",
    "print(f\"R-squared: {r2}\")\n",
    "print(f\"Mean Absolute Error: {mae}\")\n",
    "print(f\"Mean Squared Error: {mse}\")\n",
    "print(f\"p-value: {p_value:.4f}\")\n",
    "print(f\"Root Squared Error: {rse:.2f}\")\n",
    "print(f\"F-statistic: {F_value:.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Intenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
models/xgboost/gradient_boosting_regressor.py ADDED
@@ -0,0 +1,240 @@

# # -*- coding: utf-8 -*-
# """gradient_boosting_regressor.ipynb

# Automatically generated by Colab.

# Original file is located at
# https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
# """

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.preprocessing import StandardScaler, OneHotEncoder

# # the dataset I am using is from RapidApi
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers = headers)
# if response.status_code == 200:
#     data = response.json()
#     print(data)
# else:
#     print({response.status_code}, {response.text})
# # Note: climbData and descendData are not used since there is only one key entry for both features

# # Gradient Boosting Regressor
# # Here I'm using the same .json dataset with a new model, the Gradient Boosting Regressor

# data = response.json()
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)  # extracting features for the model

# #print(df.columns)

# # numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors = 'coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors = 'coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors = 'coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors = 'coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors = 'coerce')

# df.dropna(inplace = True)
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]  # you can play with this and add more features; I kept it simple with what I know is important
# target = df['fuel']

# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.2, random_state = 42)  # split into train and test

# model = GradientBoostingRegressor(n_estimators = 100, learning_rate = 25, max_depth = 5, random_state = 42)  # can play with the hyperparameters and observe the model metrics
# model.fit(features_train, target_train)  # fitting the model
# target_prediction = model.predict(features_test)  # predictions

# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()


# feature_we_want = len(target)  # what we are looking for
# regression = 1  # there is only one predictor
# residual = feature_we_want - 2
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
# unexplained_variance = mse * feature_we_want

# F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F statistic for the report
# p_value = 1 - f.cdf(F_value, regression, residual)
# rse = np.sqrt(mse)

# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]  # you can change the values; order is ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']
# predicted_fuel_future = model.predict([future_distance_nm])
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # can change to "dist_km" to see the average in km

# print(f"mean squared error: {mse}")  # checking the model performance
# print(f"R-squared: {r2}")
# print(f"mean absolute error: {mae}")
# print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the XGBoost model")
# print(f"regression: {regression:.4f}")
# print(f"residual: {residual:.4f}")
# print(f"p-value: {p_value:.4f}")  # p-value for the report
# print(f"standard error: {rse:.2f}")
# print(f"f-statistic: {F_value:.2f}")
# print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0]:.2f} kg")

# # seems like the mse is very high, but this changes if we add or remove features
# # the R-squared and mae have the same numbers as the linear regression model, so that's good

# # added more features; I am now playing with the hyperparameters, and the metrics go up and down based on them

# # mse really high, this is a bad model; R-squared is a negative number

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.preprocessing import StandardScaler

# # Load data from API
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers=headers)

# if response.status_code == 200:
#     data = response.json()
# else:
#     print(f"Error {response.status_code}: {response.text}")

# # Extract features
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)

# # Convert relevant columns to numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

# df.dropna(inplace=True)

# # Define features and target
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
# target = df['fuel']

# # Split the data
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Gradient Boosting Regressor
# model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
# model.fit(features_train, target_train)
# target_prediction = model.predict(features_test)

# # Evaluate model performance
# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# # Future predictions
# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
# predicted_fuel_future = model.predict([future_distance_nm])

# # Print the results
# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")
# print(f"Mean Absolute Error: {mae}")
# print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
# print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")

# # Comment on performance
# if mse > 1000:  # Threshold can be adjusted
#     print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
# if r2 < 0:
#     print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")

import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import f  # importing the F-distribution

# Load data from API
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    data = response.json()
else:
    print(f"Error {response.status_code}: {response.text}")

# Extract features
features = [feature['properties'] for feature in data['features']]
df = pd.DataFrame(features)

# Convert relevant columns to numeric
df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

df.dropna(inplace=True)

# Define features and target
# Note: 'fuel' appears both as a predictor and as the target, and 'CO2' is derived
# from fuel burned, so the model effectively sees the answer; a fix is sketched after this file
features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
target = df['fuel']

# Split the data
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(features_train, target_train)
target_prediction = model.predict(features_test)

# Evaluate model performance
mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# Future predictions (values for ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel'])
future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
predicted_fuel_future = model.predict([future_distance_nm])

# Calculate F-statistic and p-value (if necessary)
n_samples = len(target)
n_predictors = features_train.shape[1]
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)

# Print the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
print(f"Predicted Fuel Needed for a Flight with Features {future_distance_nm}: {predicted_fuel_future[0]:.2f} kg")
print(f"p-value: {p_value:.4f}")
print(f"F-statistic: {F_value:.2f}")

# Comment on performance
if mse > 1000:  # Threshold can be adjusted
    print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
if r2 < 0:
    print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
models/xgboost/inference.py ADDED
@@ -0,0 +1,58 @@

import pandas as pd
import joblib


def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df.drop(df.columns[0], axis=1)  # Drop the Unnamed: 0 column (index column)
    return df


def load_model(model_path):
    return joblib.load(model_path)


def evaluate_model(df, model, selected_features, batch_size=100):
    total_accuracy = 0
    num_rows = len(df)

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        batch_df = df.iloc[start:end]

        # Read the target column and select the model features (avoids mutating the slice with pop)
        fuel_burn_total = batch_df['fuel_burn_total'].values
        batch_X = batch_df[selected_features]

        predictions = model.predict(batch_X)

        # Calculate accuracy for the current batch
        accuracy = 1 - abs(fuel_burn_total - predictions) / fuel_burn_total
        batch_accuracy = accuracy.mean()
        total_accuracy += batch_accuracy * len(batch_df)

        print(f'Processed rows {start + 1} to {end} out of {num_rows} rows')

    average_accuracy = total_accuracy / num_rows
    return average_accuracy


def main():
    data_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/test.csv'
    model_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/xgboost_model.joblib'

    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
                         'CAT', 'dist']

    # Load data and model
    df = load_data(data_file_path)
    model = load_model(model_file_path)

    # Evaluate the model
    average_accuracy = evaluate_model(df, model, selected_features)

    # Print the average accuracy
    print(f'Average Accuracy: {average_accuracy:.2%}')


if __name__ == "__main__":
    main()
models/xgboost/model.py ADDED
@@ -0,0 +1,125 @@

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib


def load_data(file_path):
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    data = data.drop(data.columns[0], axis=1)  # Drop the Unnamed: 0 column
    return data


def preprocess_data(data, features_to_use, categorical_features, numerical_features):
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the datasets
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor


def train_model(X_train, y_train, preprocessor, best_params):
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training datasets
    final_pipeline.fit(X_train, y_train)

    return final_pipeline


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse


def save_model(model, model_path):
    joblib.dump(model, model_path)


def main():
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
                                                                     numerical_features)

    # best_params is a dictionary that holds the optimal hyperparameters for the XGBoost model.
    # These hyperparameters were determined through a process of hyperparameter tuning.
    #
    # - 'n_estimators': determines the number of boosting rounds or trees to build.
    # - 'max_depth': maximum tree depth for base learners.
    # - 'learning_rate': controls the shrinkage of each tree's contribution. Smaller values require more iterations but
    #   can improve generalization. Typical values range from 0.01 to 0.1.
    # - 'subsample': controls the fraction of observations used for each tree. A smaller subsample value results in
    #   smaller and less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    # Save the final model
    save_model(model, model_file_path)


if __name__ == "__main__":
    main()
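
The comment above says best_params came from a hyperparameter-tuning run. Below is a minimal sketch of how such a search could be reproduced over the same pipeline, assuming the functions defined in this file; the candidate values are illustrative, not the search space actually used.

from sklearn.model_selection import RandomizedSearchCV

def search_params(X_train, y_train, preprocessor):
    # Same pipeline shape as train_model, but with the hyperparameters left open
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
    ])
    param_distributions = {
        'model__n_estimators': [200, 400, 800],      # illustrative candidates only
        'model__max_depth': [10, 20, 30],
        'model__learning_rate': [0.03, 0.08, 0.1],
        'model__subsample': [0.7, 0.9, 1.0],
    }
    search = RandomizedSearchCV(pipeline, param_distributions, n_iter=10,
                                scoring='neg_root_mean_squared_error', cv=3,
                                random_state=42)
    search.fit(X_train, y_train)
    # Strip the 'model__' prefix so the result plugs straight into train_model()
    return {k.split('__', 1)[1]: v for k, v in search.best_params_.items()}
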