poudel committed on
Commit
85b3623
1 Parent(s): 5728f18

Delete xgboost

Browse files
xgboost/gradient_boosting_regressor.py DELETED
@@ -1,240 +0,0 @@
- # # -*- coding: utf-8 -*-
- # """gradient_boosting_regressor.ipynb
-
- # Automatically generated by Colab.
-
- # Original file is located at
- # https://colab.research.google.com/drive/1hn_e3CJx3T9jqeSZjSgcW4Dybf8sD9q9
- # """
-
- # import pandas as pd
- # import requests
- # import numpy as np
- # from sklearn.linear_model import LinearRegression
- # from sklearn.model_selection import train_test_split
- # from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
- # from sklearn.ensemble import GradientBoostingRegressor
- # from sklearn.tree import DecisionTreeRegressor
- # from sklearn.preprocessing import StandardScaler, OneHotEncoder
- # from scipy.stats import f  # needed for f.cdf below
-
- # # the dataset I am using is from RapidAPI
- # api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
- # url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
- # headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
- # response = requests.get(url, headers=headers)
- # if response.status_code == 200:
- #     data = response.json()
- #     print(data)
- # else:
- #     print(f"Error {response.status_code}: {response.text}")
- # # Note: climbData and descendData are not used since there is only one key entry for both features
-
- # # Gradient Boosting Regressor
- # # Here I'm using the same JSON dataset with a new model, GradientBoostingRegressor
-
- # data = response.json()
- # features = [feature['properties'] for feature in data['features']]
- # df = pd.DataFrame(features)  # extracting features for the model
-
- # # print(df.columns)
-
- # # numeric
- # df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
- # df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
- # df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
- # df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
- # df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')
-
- # df.dropna(inplace=True)
- # features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]  # you can play with this and add more features; I kept it simple with what I know is important
- # target = df['fuel']
-
- # features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)  # split into train and test
-
- # model = GradientBoostingRegressor(n_estimators=100, learning_rate=25, max_depth=5, random_state=42)  # can play with the hyperparameters and observe the model metrics; note learning_rate=25 is far above the typical 0.01-0.1 range, which likely explains the poor metrics below
- # model.fit(features_train, target_train)  # fitting the model
- # target_prediction = model.predict(features_test)  # predictions
-
- # mse = mean_squared_error(target_test, target_prediction)
- # r2 = r2_score(target_test, target_prediction)
- # mae = mean_absolute_error(target_test, target_prediction)
- # average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()
-
-
- # feature_we_want = len(target)  # number of observations
- # regression = 1  # there is only one predictor
- # residual = feature_we_want - 2
- # explained_variance = r2 * np.sum((target - np.mean(target))**2)
- # unexplained_variance = mse * feature_we_want
-
- # F_value = (explained_variance / regression) / (unexplained_variance / residual)  # calculating the F-statistic for report purposes
- # p_value = 1 - f.cdf(F_value, regression, residual)
- # rse = np.sqrt(mse)
-
- # future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]  # you can change these values; order is ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']
- # predicted_fuel_future = model.predict([future_distance_nm])  # predicted fuel for the hypothetical flight
- # average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()  # can change to "dist_km" to see the average in km
-
- # print(f"mean squared error: {mse}")  # checking the model performance
- # print(f"R-squared: {r2}")
- # print(f"mean absolute error: {mae}")
- # print(f"average fuel consumption per nautical mile: {average_predicted_fuel_per_nm:.2f} for the gradient boosting model")
- # print(f"regression: {regression:.4f}")
- # print(f"residual: {residual:.4f}")
- # print(f"p-value: {p_value:.4f}")  # calculating the p-value for the report
- # print(f"standard error: {rse:.2f}")
- # print(f"f-statistic: {F_value:.2f}")
- # print(f"predicted fuel needed for a {future_distance_nm} nm flight: {predicted_fuel_future[0]:.2f} kg")
-
- # # The MSE seems very high, but this changes if we add or remove features.
- # # The R-squared and MAE match the linear regression model, so that's good.
-
- # # Added more features; now playing with the hyperparameters. The metrics go up and down based on the hyperparameters.
-
- # # MSE is really high, this is a bad model; R-squared is a negative number.
-
- # import pandas as pd
- # import requests
- # import numpy as np
- # from sklearn.model_selection import train_test_split
- # from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
- # from sklearn.ensemble import GradientBoostingRegressor
- # from sklearn.preprocessing import StandardScaler
-
- # # Load data from API
- # api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
- # url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
- # headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
- # response = requests.get(url, headers=headers)
-
- # if response.status_code == 200:
- #     data = response.json()
- # else:
- #     print(f"Error {response.status_code}: {response.text}")
-
- # # Extract features
- # features = [feature['properties'] for feature in data['features']]
- # df = pd.DataFrame(features)
-
- # # Convert relevant columns to numeric
- # df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
- # df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
- # df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
- # df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
- # df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')
-
- # df.dropna(inplace=True)
-
- # # Define features and target
- # features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
- # target = df['fuel']
-
- # # Split the data
- # features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)
-
- # # Gradient Boosting Regressor
- # model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
- # model.fit(features_train, target_train)
- # target_prediction = model.predict(features_test)
-
- # # Evaluate model performance
- # mse = mean_squared_error(target_test, target_prediction)
- # r2 = r2_score(target_test, target_prediction)
- # mae = mean_absolute_error(target_test, target_prediction)
- # average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()
-
- # # Future predictions
- # future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
- # predicted_fuel_future = model.predict([future_distance_nm])
-
- # # Print the results
- # print(f"Mean Squared Error: {mse}")
- # print(f"R-squared: {r2}")
- # print(f"Mean Absolute Error: {mae}")
- # print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
- # print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")
-
- # # Comment on performance
- # if mse > 1000:  # Threshold can be adjusted
- #     print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
- # if r2 < 0:
- #     print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
-
- import pandas as pd
- import requests
- import numpy as np
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
- from sklearn.ensemble import GradientBoostingRegressor
- from scipy.stats import f  # the F-distribution, used for the p-value below
-
- # Load data from API
- api_key = '93844a03b8msh43e83be923422abp10fb67jsne048c3017988'
- url = 'https://fliteroute.p.rapidapi.com/api/gcfuelandtime/origin/GVA/dest/MIA/model/A320'
- headers = {'x-rapidapi-host': 'fliteroute.p.rapidapi.com', 'x-rapidapi-key': api_key}
- response = requests.get(url, headers=headers)
-
- if response.status_code == 200:
-     data = response.json()
- else:
-     raise SystemExit(f"Error {response.status_code}: {response.text}")  # stop early: 'data' would be undefined below
-
- # Extract features
- features = [feature['properties'] for feature in data['features']]
- df = pd.DataFrame(features)
-
- # Convert relevant columns to numeric
- df['dist_km'] = pd.to_numeric(df['dist_km'], errors='coerce')
- df['dist_nm'] = pd.to_numeric(df['dist_nm'], errors='coerce')
- df['cruiseTime'] = pd.to_numeric(df['cruiseTime'], errors='coerce')
- df['fuel'] = pd.to_numeric(df['fuel'], errors='coerce')
- df['CO2'] = pd.to_numeric(df['CO2'], errors='coerce')
-
- df.dropna(inplace=True)
-
- # Define features and target
- features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']]
- target = df['fuel']
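- # Note: 'fuel' appears both as a feature and as the target, so the model can
- # read the answer straight off its input (target leakage). For a meaningful
- # evaluation you would likely drop it from the feature list, e.g.
- # features = df[['dist_km', 'cruiseTime', 'dist_nm', 'CO2']].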
-
- # Split the data
- features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)
-
- # Gradient Boosting Regressor
- model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
- model.fit(features_train, target_train)
- target_prediction = model.predict(features_test)
-
- # Evaluate model performance
- mse = mean_squared_error(target_test, target_prediction)
- r2 = r2_score(target_test, target_prediction)
- mae = mean_absolute_error(target_test, target_prediction)
- average_predicted_fuel_per_nm = (target_prediction / features_test['dist_nm']).mean()
-
- # Future predictions: one full feature row in column order ['dist_km', 'cruiseTime', 'dist_nm', 'CO2', 'fuel']
- future_distance_nm = [30.90, 40, 1894.34, 23.9, 17.92]
- predicted_fuel_future = model.predict([future_distance_nm])
-
- # Calculate F-statistic and p-value (if necessary)
- n_samples = len(target)
- n_predictors = features_train.shape[1]
- residual = n_samples - n_predictors - 1
- explained_variance = r2 * np.sum((target - np.mean(target))**2)
- unexplained_variance = mse * n_samples
-
- F_value = (explained_variance / n_predictors) / (unexplained_variance / residual)
- p_value = 1 - f.cdf(F_value, n_predictors, residual)
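- # Note: this F-test is borrowed from linear-regression ANOVA, so for a boosted
- # tree model it is at best a rough diagnostic; the formula also mixes the
- # test-set MSE with the full-sample variance of the target, so treat the
- # p-value as indicative only.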
-
- # Print the results
- print(f"Mean Squared Error: {mse}")
- print(f"R-squared: {r2}")
- print(f"Mean Absolute Error: {mae}")
- print(f"Average Fuel Consumption per Nautical Mile: {average_predicted_fuel_per_nm:.2f} kg")
- print(f"Predicted Fuel Needed for a {future_distance_nm} nm Flight: {predicted_fuel_future[0]:.2f} kg")
- print(f"p-value: {p_value:.4f}")
- print(f"F-statistic: {F_value:.2f}")
-
- # Comment on performance
- if mse > 1000:  # Threshold can be adjusted
-     print("Warning: The MSE is very high, indicating that the model might not be performing well. Consider tuning hyperparameters.")
- if r2 < 0:
-     print("Warning: The R-squared value is negative, which suggests that the model is worse than a simple mean prediction.")
xgboost/inference.py DELETED
@@ -1,58 +0,0 @@
- import pandas as pd
- import joblib
-
-
- def load_data(file_path):
-     df = pd.read_csv(file_path)
-     df = df.drop(df.columns[0], axis=1)  # Drop the Unnamed: 0 column (index column)
-     return df
-
-
- def load_model(model_path):
-     return joblib.load(model_path)
-
-
- def evaluate_model(df, model, selected_features, batch_size=100):
-     total_accuracy = 0
-     num_rows = len(df)
-
-     for start in range(0, num_rows, batch_size):
-         end = min(start + batch_size, num_rows)
-         batch_df = df.iloc[start:end].copy()  # copy so pop() below doesn't mutate a slice of df
-
-         fuel_burn_total = batch_df.pop('fuel_burn_total').values
-         batch_df = batch_df[selected_features]
-
-         predictions = model.predict(batch_df)
-
-         # Calculate accuracy for the current batch
-         accuracy = 1 - abs(fuel_burn_total - predictions) / fuel_burn_total
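-         # Note: this "accuracy" is 1 minus the relative absolute error (the
-         # complement of a per-row percentage error); it assumes
-         # fuel_burn_total is never zero.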
-         batch_accuracy = accuracy.mean()
-         total_accuracy += batch_accuracy * len(batch_df)
-
-         print(f'Processed rows {start + 1} to {end} out of {num_rows} rows')
-
-     average_accuracy = total_accuracy / num_rows
-     return average_accuracy
-
-
- def main():
-     data_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/datasets/test.csv'
-     model_file_path = '/Users/ashishpoudel/Downloads/AircraftFuelPrediction-main/saved_models/xgboost_model.joblib'
-
-     selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
-                          'CAT', 'dist']
-
-     # Load data and model
-     df = load_data(data_file_path)
-     model = load_model(model_file_path)
-
-     # Evaluate the model
-     average_accuracy = evaluate_model(df, model, selected_features)
-
-     # Print the average accuracy
-     print(f'Average Accuracy: {average_accuracy:.2%}')
-
-
- if __name__ == "__main__":
-     main()
xgboost/model.py DELETED
@@ -1,125 +0,0 @@
- import pandas as pd
- import numpy as np
- from sklearn.metrics import mean_absolute_error, mean_squared_error
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
- from sklearn.compose import ColumnTransformer
- from sklearn.pipeline import Pipeline
- import xgboost as xgb
- import joblib
-
-
- def load_data(file_path):
-     data = pd.read_csv(file_path)
-     data = data.reset_index(drop=True)
-     data = data.drop(data.columns[0], axis=1)  # Drop the Unnamed: 0 column
-     return data
-
-
- def preprocess_data(data, features_to_use, categorical_features, numerical_features):
-     # Preprocessing pipelines for both numeric and categorical features
-     numeric_transformer = Pipeline(steps=[
-         ('scaler', StandardScaler())
-     ])
-
-     categorical_transformer = Pipeline(steps=[
-         ('encoder', OneHotEncoder(handle_unknown='ignore'))
-     ])
-
-     # Combine preprocessing steps
-     preprocessor = ColumnTransformer(
-         transformers=[
-             ('num', numeric_transformer, numerical_features),
-             ('cat', categorical_transformer, categorical_features)
-         ])
-
-     # Split the dataset into train and test sets
-     X = data[features_to_use]
-     y = data['fuel_burn_total']
-     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-     return X_train, X_test, y_train, y_test, preprocessor
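-
- # Note: preprocess_data returns the ColumnTransformer unfitted; it is fit
- # inside the Pipeline in train_model() on the training split only, which keeps
- # test-set statistics out of the scaler and the encoder.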
-
-
- def train_model(X_train, y_train, preprocessor, best_params):
-     # Create the final model pipeline using the best parameters
-     final_pipeline = Pipeline(steps=[
-         ('preprocessor', preprocessor),
-         ('model', xgb.XGBRegressor(objective='reg:squarederror',
-                                    n_estimators=best_params['n_estimators'],
-                                    max_depth=best_params['max_depth'],
-                                    learning_rate=best_params['learning_rate'],
-                                    subsample=best_params['subsample'],
-                                    random_state=42))
-     ])
-
-     # Train the final model on the entire training set
-     final_pipeline.fit(X_train, y_train)
-
-     return final_pipeline
-
-
- def evaluate_model(model, X_test, y_test):
-     y_pred = model.predict(X_test)
-
-     mae = mean_absolute_error(y_test, y_pred)
-     mse = mean_squared_error(y_test, y_pred)
-     rmse = np.sqrt(mse)
-     return mae, rmse
-
-
- def save_model(model, model_path):
-     joblib.dump(model, model_path)
-
-
- def main():
-     data_file_path = '../../datasets/preprocessed_data.csv'
-     model_file_path = '../../saved_models/xgboost_model.joblib'
-
-     features_to_use = [
-         'Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
-         'seats', 'distance',
-         'J/T', 'CAT', 'dist'
-     ]
-
-     # Identify categorical and numerical features
-     categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer',
-                             'J/T', 'CAT']
-     numerical_features = [col for col in features_to_use if col not in categorical_features]
-
-     # Load data
-     data = load_data(data_file_path)
-
-     # Preprocess the data
-     X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, features_to_use, categorical_features,
-                                                                      numerical_features)
-
-     # best_params holds the hyperparameters chosen for the XGBoost model.
-     # They were determined through a round of hyperparameter tuning (a
-     # hypothetical search is sketched below).
-     #
-     # - 'n_estimators': the number of boosting rounds (trees) to build.
-     # - 'max_depth': maximum tree depth for the base learners.
-     # - 'learning_rate': controls the shrinkage of each tree's contribution. Smaller values
-     #   require more iterations but can improve generalization. Typical values range from 0.01 to 0.1.
-     # - 'subsample': the fraction of observations used for each tree. A smaller subsample
-     #   results in smaller, less complex models, which can help prevent overfitting.
-     best_params = {
-         'n_estimators': 400,
-         'max_depth': 20,
-         'learning_rate': 0.08,
-         'subsample': 0.9,
-     }
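-
-     # Hypothetical sketch of how such values could be found (not the original
-     # search; the parameter ranges here are illustrative assumptions):
-     #
-     # from sklearn.model_selection import RandomizedSearchCV
-     # search = RandomizedSearchCV(
-     #     Pipeline(steps=[('preprocessor', preprocessor),
-     #                     ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))]),
-     #     param_distributions={'model__n_estimators': [200, 400, 800],
-     #                          'model__max_depth': [5, 10, 20],
-     #                          'model__learning_rate': [0.01, 0.05, 0.08, 0.1],
-     #                          'model__subsample': [0.7, 0.8, 0.9, 1.0]},
-     #     n_iter=20, cv=3, scoring='neg_mean_absolute_error', random_state=42)
-     # search.fit(X_train, y_train)
-     # best_params = {k.replace('model__', ''): v for k, v in search.best_params_.items()}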
-
-     # Train the model
-     model = train_model(X_train, y_train, preprocessor, best_params)
-
-     # Evaluate the model
-     mae, rmse = evaluate_model(model, X_test, y_test)
-     print(f'MAE: {mae}')
-     print(f'RMSE: {rmse}')
-
-     # Save the final model
-     save_model(model, model_file_path)
-
-
- if __name__ == "__main__":
-     main()