import pandas as pd import numpy as np from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Dropout, Input from tensorflow.keras.callbacks import EarlyStopping from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline import joblib def load_data(file_path): return pd.read_csv(file_path) def preprocess_data(data, selected_features, categorical_features, numerical_features): # Define preprocessing pipelines numeric_transformer = Pipeline(steps=[ ('scaler', StandardScaler()) ]) categorical_transformer = Pipeline(steps=[ ('encoder', OneHotEncoder(handle_unknown='ignore')) ]) # Combine preprocessing steps preprocessor = ColumnTransformer( transformers=[ ('num', numeric_transformer, numerical_features), ('cat', categorical_transformer, categorical_features) ]) # Split the datasets X = data[selected_features] y = data['fuel_burn_total'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Preprocess the datasets X_train = preprocessor.fit_transform(X_train) X_test = preprocessor.transform(X_test) return X_train, X_test, y_train, y_test, preprocessor def build_model(input_shape): model = Sequential([ Input(shape=(input_shape,)), Dense(64, activation='relu'), Dense(64, activation='relu'), Dense(1) ]) model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae']) return model def train_model(model, X_train, y_train, epochs=50, batch_size=32, patience=10, validation_split=0.2): """ Trains the provided model using the training data. Parameters: model (tensorflow.keras.Model): The model to be trained. X_train (numpy.ndarray): The training data. y_train (numpy.ndarray): The target values for the training data. epochs (int, optional): The number of epochs to train the model. Default is 50. batch_size (int, optional): The number of samples per gradient update. Default is 32. patience (int, optional): Number of epochs with no improvement after which training will be stopped. Default is 10. validation_split (float, optional): Fraction of the training data to be used as validation data. Default is 0.2. Returns: model (tensorflow.keras.Model): The trained model. history (tensorflow.python.keras.callbacks.History): A record of training loss values and metrics values at successive epochs. """ early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True) history = model.fit(X_train, y_train, validation_split=validation_split, epochs=epochs, callbacks=[early_stopping], batch_size=batch_size) return model, history def evaluate_model(model, X_test, y_test): y_pred = model.predict(X_test) mae = np.mean(np.abs(y_test - y_pred.flatten())) rmse = np.sqrt(np.mean((y_test - y_pred.flatten()) ** 2)) return mae, rmse def save_model(model, preprocessor, model_path, preprocessor_path): model.save(model_path) joblib.dump(preprocessor, preprocessor_path) def main(): data = load_data('../../datasets/preprocessed_data.csv') selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T', 'CAT', 'dist'] categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'J/T', 'CAT'] numerical_features = ['seats', 'distance', 'dist'] X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, selected_features, categorical_features, numerical_features) model = build_model(X_train.shape[1]) model, history = train_model(model, X_train, y_train) mae, rmse = evaluate_model(model, X_test, y_test) print(f'MAE: {mae}') print(f'RMSE: {rmse}') save_model(model, preprocessor, '../../saved_models/nn_model.keras', '../../saved_models/nn_preprocessor.pkl') if __name__ == "__main__": import os os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' main()