import os

# TF_ENABLE_ONEDNN_OPTS only takes effect if it is set before TensorFlow is
# imported, so it is set here rather than under the __main__ guard.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential


def load_data(file_path):
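    """Loads the dataset from a CSV file."""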
    return pd.read_csv(file_path)


def preprocess_data(data, selected_features, categorical_features, numerical_features):
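    """
    Scales numerical features, one-hot encodes categorical features, and splits
    the data into train and test sets. Returns the transformed splits together
    with the fitted preprocessor so it can be reused at inference time.
    """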
    # Define preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
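    # handle_unknown='ignore' makes the encoder emit all-zero vectors for
    # categories unseen during fit instead of raising an error at transform time.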
    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
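    # Columns not listed in either group are dropped (the ColumnTransformer
    # default, remainder='drop').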
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Split the dataset into train and test sets
    X = data[selected_features]
    y = data['fuel_burn_total']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocess the datasets
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)
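    # Note: with many one-hot categories the transformed output may be a scipy
    # sparse matrix; call .toarray() on it if the Keras model rejects sparse input.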

    return X_train, X_test, y_train, y_test, preprocessor


def build_model(input_shape):
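    """
    Builds a feed-forward regression network: two hidden layers of 64 ReLU
    units and a single linear output unit, compiled with the Adam optimizer,
    mean squared error loss, and MAE as a monitoring metric.
    """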
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model


def train_model(model, X_train, y_train, epochs=50, batch_size=32, patience=10, validation_split=0.2):
    """
        Trains the provided model using the training data.

        Parameters:
        model (tensorflow.keras.Model): The model to be trained.
        X_train (numpy.ndarray): The training data.
        y_train (numpy.ndarray): The target values for the training data.
        epochs (int, optional): The number of epochs to train the model. Default is 50.
        batch_size (int, optional): The number of samples per gradient update. Default is 32.
        patience (int, optional): Number of epochs with no improvement after which training will be stopped. Default is 10.
        validation_split (float, optional): Fraction of the training data to be used as validation data. Default is 0.2.

        Returns:
        model (tensorflow.keras.Model): The trained model.
        history (tensorflow.python.keras.callbacks.History): A record of training loss values and metrics values at successive epochs.
        """
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(X_train, y_train, validation_split=validation_split, epochs=epochs, callbacks=[early_stopping],
                        batch_size=batch_size)

    return model, history


def evaluate_model(model, X_test, y_test):
    """Computes MAE and RMSE of the model's predictions on the test set."""
    y_pred = model.predict(X_test).flatten()
    y_true = np.asarray(y_test)
    mae = np.mean(np.abs(y_true - y_pred))
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return mae, rmse


def save_model(model, preprocessor, model_path, preprocessor_path):
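    """Saves the trained model (native .keras format) and the fitted preprocessor (joblib)."""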
    model.save(model_path)
    joblib.dump(preprocessor, preprocessor_path)


def main():
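    """Loads the data, trains the fuel-burn model, evaluates it, and saves the artifacts."""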
    data = load_data('../../datasets/preprocessed_data.csv')
    selected_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'seats', 'distance', 'J/T',
                         'CAT', 'dist']

    categorical_features = ['Origin_Airport', 'Destination_Airport', 'model', '_Manufacturer', 'J/T', 'CAT']
    numerical_features = ['seats', 'distance', 'dist']

    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data, selected_features, categorical_features,
                                                                     numerical_features)

    model = build_model(X_train.shape[1])

    model, history = train_model(model, X_train, y_train)

    mae, rmse = evaluate_model(model, X_test, y_test)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')

    save_model(model, preprocessor, '../../saved_models/nn_model.keras', '../../saved_models/nn_preprocessor.pkl')


if __name__ == "__main__":
    import os

    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    main()