Spaces:
Runtime error
Delete xgboost
- xgboost/gradient_boosting_regressor.py +0 -240
- xgboost/inference.py +0 -58
- xgboost/model.py +0 -125
xgboost/gradient_boosting_regressor.py
DELETED
@@ -1,240 +0,0 @@
# # -*- coding: utf-8 -*-
# """gradient_boosting_regressor.ipynb

# Automatically generated by Colab.

# Original file is located at
#     https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
# """

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from scipy.stats import f  # needed for the p-value calculation below

# # the dataset I am using is from RapidAPI
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers=headers)
# if response.status_code == 200:
#     data = response.json()
#     print(data)
# else:
#     print(f"Error {response.status_code}: {response.text}")
# # Note: climbData and descendData are not used since there is only one key entry for both features

# # Gradient Boosting Regressor
# # Here I'm using the same JSON dataset with a new model, Gradient Boosting Regressor

# data = response.json()
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)  # extracting features for the model

# # print(df.columns)

# # convert to numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

# df.dropna(inplace=True)
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]  # you can play with this and add more features; I kept it simple with what I know is important
# target = df['fuel']

# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)  # split into train and test

# model = GradientBoostingRegressor(n_estimators=100, learning_rate=25, max_depth=5, random_state=42)  # can play with the hyperparameters and observe model metrics
# model.fit(features_train, target_train)  # fitting the model
# target_prediction = model.predict(features_test)  # predictions

# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()


# feature_we_want = len(target)  # number of samples
# regression = 1  # regression degrees of freedom (assumes a single predictor)
# residual = feature_we_want - 2
# explained_variance = r2 * np.sum((target - np.mean(target))**2)
# unexplained_variance = mse * feature_we_want

# F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F-statistic for the report
# p_value = 1 - f.cdf(F_value, regression, residual)
# rse = np.sqrt(mse)

# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]  # one value per feature in ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']; you can change these
# predicted_fuel_future = model.predict([future_distance_nm])  # prediction for the new row
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # can change to "dist_km" to see the average in km

# print(f"mean squared error: {mse}")  # checking the model performance
# print(f"R-squared: {r2}")
# print(f"mean absolute error: {mae}")
# print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the gradient boosting model")
# print(f"regression: {regression:.4f}")
# print(f"residual: {residual:.4f}")
# print(f"p-value: {p_value:.4f}")  # calculating the p-value for the report
# print(f"standard error: {rse:.2f}")
# print(f"f-statistic: {F_value:.2f}")
# print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0]:.2f} kg")

# # the MSE seems very high, but this changes if we add or remove features
# # the R-squared and MAE match the linear regression model, so that's good

# # added more features; now playing with the hyperparameters, the metrics go up and down depending on them

# # MSE really high, this is a bad model; R-squared is a negative number

# import pandas as pd
# import requests
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.preprocessing import StandardScaler

# # Load data from API
# api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
# url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
# headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
# response = requests.get(url, headers=headers)

# if response.status_code == 200:
#     data = response.json()
# else:
#     print(f"Error {response.status_code}: {response.text}")

# # Extract features
# features = [feature['properties'] for feature in data['features']]
# df = pd.DataFrame(features)

# # Convert relevant columns to numeric
# df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
# df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
# df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
# df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
# df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

# df.dropna(inplace=True)

# # Define features and target
# features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
# target = df['fuel']

# # Split the data
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Gradient Boosting Regressor
# model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
# model.fit(features_train, target_train)
# target_prediction = model.predict(features_test)

# # Evaluate model performance
# mse = mean_squared_error(target_test, target_prediction)
# r2 = r2_score(target_test, target_prediction)
# mae = mean_absolute_error(target_test, target_prediction)
# average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# # Future predictions
# future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
# predicted_fuel_future = model.predict([future_distance_nm])

# # Print the results
# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")
# print(f"Mean Absolute Error: {mae}")
# print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
# print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")

# # Comment on performance
# if mse > 1000:  # Threshold can be adjusted
#     print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
# if r2 < 0:
#     print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")

import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import f  # Importing the F-distribution

# Load data from API
api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    data = response.json()
else:
    raise SystemExit(f"Error {response.status_code}: {response.text}")  # stop here: the rest of the script needs `data`

# Extract features
features = [feature['properties'] for feature in data['features']]
df = pd.DataFrame(features)

# Convert relevant columns to numeric
df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')

df.dropna(inplace=True)

# Define features and target
# NOTE: 'fuel' is both a feature and the target here, so the model can read the answer off its input (target leakage)
features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
target = df['fuel']

# Split the data
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(features_train, target_train)
target_prediction = model.predict(features_test)

# Evaluate model performance
mse = mean_squared_error(target_test, target_prediction)
r2 = r2_score(target_test, target_prediction)
mae = mean_absolute_error(target_test, target_prediction)
average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()

# Future predictions (one row, same column order as `features`)
future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
predicted_fuel_future = model.predict(pd.DataFrame([future_distance_nm], columns=features.columns))

# Calculate F-statistic and p-value (if necessary)
n_samples = len(target)
n_predictors = features_train.shape[1]
residual = n_samples - n_predictors - 1
explained_variance = r2 * np.sum((target - np.mean(target))**2)
unexplained_variance = mse * n_samples

F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
p_value = 1 - f.cdf(F_value, n_predictors, residual)

# Print the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")
print(f"p-value: {p_value:.4f}")
print(f"F-statistic: {F_value:.2f}")

# Comment on performance
if mse > 1000:  # Threshold can be adjusted
    print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
if r2 < 0:
    print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
xgboost/inference.py
DELETED
@@ -1,58 +0,0 @@
import pandas as pd
import joblib


def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df.drop(df.columns[0], axis=1)  # Drop the 'Unnamed: 0' (index) column
    return df


def load_model(model_path):
    return joblib.load(model_path)


def evaluate_model(df, model, selected_features, batch_size=100):
    total_accuracy = 0
    num_rows = len(df)

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        batch_df = df.iloc[start:end].copy()  # copy so pop() doesn't touch the original frame

        fuel_burn_total = batch_df.pop('fuel_burn_total').values
        batch_df = batch_df[selected_features]

        predictions = model.predict(batch_df)

        # Accuracy for the current batch: 1 minus the mean relative error
        accuracy = 1 - abs(fuel_burn_total - predictions) / fuel_burn_total
        batch_accuracy = accuracy.mean()
        total_accuracy += batch_accuracy * len(batch_df)

        print(f'Processed rows {start + 1} to {end} out of {num_rows} rows')

    average_accuracy = total_accuracy / num_rows
    return average_accuracy


def main():
    data_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/test.csv'
    model_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/xgboost_model.joblib'

    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                         'seats', 'distance', 'J/T', 'CAT', 'dist']

    # Load data and model
    df = load_data(data_file_path)
    model = load_model(model_file_path)

    # Evaluate the model
    average_accuracy = evaluate_model(df, model, selected_features)

    # Print the average accuracy
    print(f'Average Accuracy: {average_accuracy:.2%}')


if __name__ == "__main__":
    main()
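The per-batch accuracy above is one minus the mean absolute relative error, i.e. 1 - MAPE, and because each batch is weighted by its row count, the final average equals the same metric computed over all rows at once. A cross-check sketch, assuming the same test.csv layout and that all rows fit in memory (the paths here are placeholders, not the originals):

import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_percentage_error

df = pd.read_csv('test.csv').drop(columns=['Unnamed: 0'])  # placeholder path, same layout as above
model = joblib.load('xgboost_model.joblib')                # placeholder path

y_true = df.pop('fuel_burn_total')
selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                     'seats', 'distance', 'J/T', 'CAT', 'dist']
y_pred = model.predict(df[selected_features])
print(f"Average Accuracy: {1 - mean_absolute_percentage_error(y_true, y_pred):.2%}")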
xgboost/model.py
DELETED
@@ -1,125 +0,0 @@
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib


def load_data(file_path):
    data = pd.read_csv(file_path)
    data = data.reset_index(drop=True)
    data = data.drop(data.columns[0], axis=1)  # Drop the 'Unnamed: 0' column
    return data


def preprocess_data(data, features_to_use, categorical_features, numerical_features):
    # Preprocessing pipelines for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the dataset
    X = data[features_to_use]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor


def train_model(X_train, y_train, preprocessor, best_params):
    # Create the final model pipeline using the best parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   learning_rate=best_params['learning_rate'],
                                   subsample=best_params['subsample'],
                                   random_state=42))
    ])

    # Train the final model on the entire training set
    final_pipeline.fit(X_train, y_train)

    return final_pipeline


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mae, rmse


def save_model(model, model_path):
    joblib.dump(model, model_path)


def main():
    data_file_path = '../../datasets/preprocessed_data.csv'
    model_file_path = '../../saved_models/xgboost_model.joblib'

    features_to_use = [
        'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
        'seats', 'distance',
        'J/T', 'CAT', 'dist'
    ]

    # Identify categorical and numerical features
    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
                            'J/T', 'CAT']
    numerical_features = [col for col in features_to_use if col not in categorical_features]

    # Load data
    data = load_data(data_file_path)

    # Preprocess the data
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
                                                                     numerical_features)

    # best_params is a dictionary that holds the optimal hyperparameters for the XGBoost model.
    # These hyperparameters were determined through a process of hyperparameter tuning.
    #
    # - 'n_estimators': determines the number of boosting rounds or trees to build.
    # - 'max_depth': maximum tree depth for base learners.
    # - 'learning_rate': controls the shrinkage of each tree's contribution. Smaller values require more iterations but
    #   can improve generalization. Typical values range from 0.01 to 0.1.
    # - 'subsample': controls the fraction of observations used for each tree. A smaller subsample value results in
    #   smaller and less complex models, which can help prevent overfitting.
    best_params = {
        'n_estimators': 400,
        'max_depth': 20,
        'learning_rate': 0.08,
        'subsample': 0.9,
    }

    # Train the model
    model = train_model(X_train, y_train, preprocessor, best_params)

    # Evaluate the model
    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    # Save the final model
    save_model(model, model_file_path)


if __name__ == "__main__":
    main()
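The best_params dictionary above is described as the result of hyperparameter tuning, but the search itself is not part of this file. A sketch of what such a search could look like, using RandomizedSearchCV over the same pipeline; the parameter ranges below are illustrative, not the ones actually searched:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Assumes X_train, y_train and preprocessor come from preprocess_data() above
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])

param_distributions = {  # illustrative ranges, not the original search space
    'model__n_estimators': [100, 200, 400, 800],
    'model__max_depth': [5, 10, 20],
    'model__learning_rate': [0.01, 0.05, 0.08, 0.1],
    'model__subsample': [0.7, 0.8, 0.9, 1.0],
}

search = RandomizedSearchCV(pipeline, param_distributions, n_iter=20, cv=3,
                            scoring='neg_root_mean_squared_error', random_state=42)
search.fit(X_train, y_train)
print(search.best_params_)  # values like these would feed best_params in main()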