File size: 10,747 Bytes
5ff507b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# train.py

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from transformers import Trainer, TrainingArguments
from sklearn.metrics import ndcg_score
import json

from data_processing import load_data, EmbeddingGenerator, prepare_labels
from utils import compute_ndcg
import numpy as np
from sklearn.metrics import ndcg_score, mean_squared_error

# Random baseline: uniform draws spanning the observed label range.
def generate_random_predictions(y_true):
    """Draw uniform random predictions over [y_true.min(), y_true.max()]."""
    low, high = y_true.min(), y_true.max()
    return np.random.uniform(low, high, size=y_true.shape)

# Relative lift of the model over a random baseline for one example.
def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
    """Return (lift, model_score, random_score) for the chosen metric.

    For "ndcg" higher is better; for "mse" lower is better, so the lift
    formula is inverted there — positive lift always means "beats random".
    """
    baseline = generate_random_predictions(y_true)

    if metric == "ndcg":
        model_score = ndcg_score([y_true], [model_predictions])
        random_score = ndcg_score([y_true], [baseline])
        return (model_score - random_score) / random_score, model_score, random_score
    if metric == "mse":
        model_score = mean_squared_error(y_true, model_predictions)
        random_score = mean_squared_error(y_true, baseline)
        return (random_score - model_score) / random_score, model_score, random_score
    raise ValueError("Unsupported metric")

# Regression head: a single linear projection onto all output dimensions.
class MultiOutputRegressor(nn.Module):
    """Map a fixed-size embedding to `num_outputs` real-valued scores."""

    def __init__(self, hidden_size, num_outputs):
        super().__init__()
        self.regressor_head = nn.Linear(hidden_size, num_outputs)

    def forward(self, input_ids):
        # NOTE(review): despite the name, `input_ids` carries dense
        # embedding vectors, not token ids — confirm with callers.
        return self.regressor_head(input_ids)

# Wraps precomputed embeddings and label vectors for the Trainer API.
class EmbeddingDataset(torch.utils.data.Dataset):
    """Dataset yielding {"input_ids": embedding, "label": labels} dicts."""

    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return {
            "input_ids": self.embeddings[idx],
            "label": self.labels[idx],
        }

# Batching: stack per-example tensors along a new leading batch axis.
class CustomDataCollator:
    """Collate a list of example dicts into a single batch dict."""

    def __call__(self, features):
        return {
            "input_ids": torch.stack([f["input_ids"] for f in features]),
            "label": torch.stack([f["label"] for f in features]),
        }

# Trainer subclass whose loss is plain MSE between head outputs and labels.
class CustomTrainer(Trainer):
    """HF Trainer overriding compute_loss for multi-output regression."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        device = self.args.device
        preds = model(inputs["input_ids"].to(device))
        targets = inputs["label"].to(device)
        loss = nn.MSELoss()(preds, targets)
        if return_outputs:
            return loss, preds
        return loss

def main():
    """Run K-fold cross-validation of a linear regression head on text
    embeddings, attach out-of-fold predictions to every record, then train
    a final model on the full dataset and save it to disk.
    """
    # Load labeled records; each must have at least a 'description' field.
    outdata = load_data("labeled_users.json")

    descriptions = [record['description'] for record in outdata]

    # Dense sentence embeddings for every description.
    embedder = EmbeddingGenerator()
    X_embeddings = embedder.generate_embeddings(descriptions)

    # Multi-label regression targets plus label <-> id mappings.
    y_matrix, label2id, id2label = prepare_labels(outdata)

    # Persist mappings so inference code can decode prediction vectors later.
    mappings = {'label2id': label2id, 'id2label': id2label}
    with open('label_mappings.json', 'w') as f:
        json.dump(mappings, f)

    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
    train_labels = torch.tensor(y_matrix, dtype=torch.float)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data_collator = CustomDataCollator()

    n_splits = 5  # Number of folds
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    hidden_size = train_embeddings.shape[1]
    num_outputs = train_labels.shape[1]
    fold_ndcg_scores = []
    # BUGFIX: KFold(shuffle=True) yields validation indices out of original
    # order. The previous code appended each fold's predictions to a list and
    # later indexed that list by record position, which attached predictions
    # to the wrong records. Preallocate instead and write each fold's
    # predictions back at its own validation indices.
    all_preds = np.zeros((len(train_embeddings), num_outputs), dtype=np.float32)

    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
        print(f"Fold {fold + 1}/{n_splits}")

        # Split data into training and validation sets for this fold.
        X_train_fold = train_embeddings[train_index]
        y_train_fold = train_labels[train_index]
        X_val_fold = train_embeddings[val_index]
        y_val_fold = train_labels[val_index]

        train_dataset = EmbeddingDataset(X_train_fold, y_train_fold)
        val_dataset = EmbeddingDataset(X_val_fold, y_val_fold)

        # Fresh model per fold so folds do not leak into each other.
        model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
        model.to(device)

        training_args = TrainingArguments(
            output_dir=f"./results_fold_{fold+1}",
            num_train_epochs=10,
            per_device_train_batch_size=64,
            logging_dir=f"./logs_fold_{fold+1}",
            evaluation_strategy="no",  # No evaluation during training
            save_strategy="no",  # Not saving checkpoints
            disable_tqdm=True,  # Disable progress bar
            learning_rate=1e-5,
            weight_decay=0.01,  # Apply a small weight decay
            max_grad_norm=1.0  # Clip gradients to 1.0
        )
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
        )
        trainer.train()

        # Evaluate the model on the held-out validation split.
        val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

        fold_preds = []
        fold_labels = []

        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["label"].to(device)
                outputs = model(input_ids)
                fold_preds.append(outputs.cpu().numpy())
                fold_labels.append(labels.cpu().numpy())

        y_pred = np.concatenate(fold_preds, axis=0)
        y_true = np.concatenate(fold_labels, axis=0)

        # Store out-of-fold predictions at their ORIGINAL record positions.
        all_preds[val_index] = y_pred

        # Per-example NDCG and lift over a random baseline.
        all_ndcgs = []
        lifts = []
        for i in range(len(y_true)):
            actual_weights = y_true[i]
            predicted_weights = y_pred[i]
            ndcg = ndcg_score([actual_weights], [predicted_weights])
            lift, model_score, random_score = calculate_relative_lift(actual_weights, predicted_weights, metric="ndcg")
            lifts.append(lift)
            all_ndcgs.append(ndcg)

        # Average metrics for the current fold (0.0 if the fold was empty).
        avg_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        avg_lift = np.mean(lifts) if lifts else 0.0
        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
        print(f"Average Lift for fold {fold + 1}: {avg_lift:.4f}")
        fold_ndcg_scores.append(avg_ndcg)

    # After all folds
    overall_avg_ndcg = np.mean(fold_ndcg_scores)
    print(f"\nOverall Average NDCG across all folds: {overall_avg_ndcg:.4f}")

    # Enrich each record with its embedding and decoded predictions.
    for idx, record in enumerate(outdata):
        record['embedding'] = X_embeddings[idx].tolist()
        pred = all_preds[idx]
        label_pred_dict = {id2label[i]: float(pred[i]) for i in range(len(pred))}
        record['predictions'] = label_pred_dict

    # Save enriched data as JSON lines.
    with open("enriched_data.json", "w") as f:
        for row in outdata:
            _ = f.write(json.dumps(row) + '\n')

    # Re-train a final model on the entire dataset for deployment.
    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    model.to(device)

    train_dataset = EmbeddingDataset(train_embeddings, train_labels)

    training_args = TrainingArguments(
        output_dir="./final_model",
        num_train_epochs=10,  # Adjust as needed
        per_device_train_batch_size=8,
        logging_dir="./logs_final",
        evaluation_strategy="no",
        save_strategy="no",
        disable_tqdm=False,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    trainer.train()

    # Persist only the state dict; load_and_predict rebuilds the module.
    model_save_path = 'multioutput_regressor.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    # Optional: Demonstrate loading and using the model
    load_and_predict(embedder, hidden_size, num_outputs, device)

def load_and_predict(embedder, hidden_size, num_outputs, device):
    """
    Load the saved model and label mappings, make predictions on new data,
    and map the predictions to labels.
    """
    # Label mappings were serialized by main(); reload them here.
    with open('label_mappings.json', 'r') as f:
        id2label = json.load(f)['id2label']

    # Rebuild the module and restore the trained weights.
    model_save_path = 'multioutput_regressor.pth'
    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
    loaded_model.to(device)
    loaded_model.eval()

    # Demo inputs to score.
    new_sentences = [
        "This is a test sentence.",
        "Another example of a sentence to predict."
    ]
    features = embedder.generate_embeddings(new_sentences)
    features_tensor = torch.tensor(features, dtype=torch.float).to(device)

    with torch.no_grad():
        scores = loaded_model(features_tensor)
    scores = scores.cpu().numpy()

    for sentence, row in zip(new_sentences, scores):
        # JSON round-trips dict keys as strings, hence the str(i) lookup.
        label_pred_dict = {id2label[str(i)]: float(row[i]) for i in range(len(row))}
        print(f"Sentence: {sentence}")
        print("Predictions:")
        for label, value in label_pred_dict.items():
            print(f"  {label}: {value}")
        print()

# Entry point. Removed commented-out dead code that referenced names
# (hidden_size, model_save_path, device) only defined inside main().
if __name__ == "__main__":
    main()