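# train.py
#
# Listing metadata carried over from the file viewer (not part of the program):
#   file size 10,747 bytes; id 5ff507b (presumably the revision hash).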

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from transformers import Trainer, TrainingArguments
from sklearn.metrics import ndcg_score
import json

from data_processing import load_data, EmbeddingGenerator, prepare_labels
from utils import compute_ndcg
import numpy as np
from sklearn.metrics import ndcg_score, mean_squared_error

# Generate random predictions based on label distribution
def generate_random_predictions(y_true):
    """Draw a random baseline prediction for every entry of ``y_true``.

    Values are sampled uniformly between the smallest and the largest value
    found in ``y_true``; the result has the same shape as ``y_true``.
    """
    lowest = y_true.min()
    highest = y_true.max()
    return np.random.uniform(low=lowest, high=highest, size=y_true.shape)

# Evaluate relative lift
def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
    """Score model predictions relative to a uniformly random baseline.

    A random prediction is drawn with ``generate_random_predictions`` and both
    it and ``model_predictions`` are scored against ``y_true``.

    Args:
        y_true: Ground-truth values for one sample (for NDCG it is wrapped in
            a list and scored as a single query).
        model_predictions: Model outputs aligned with ``y_true``.
        metric: ``"ndcg"`` (higher is better) or ``"mse"`` (lower is better).

    Returns:
        A ``(lift, model_score, random_score)`` tuple. ``lift`` is the
        improvement over the random baseline divided by the baseline score,
        oriented so that a positive value always means "better than random".
        When the baseline score is exactly zero the ratio is undefined:
        ``0.0`` is returned if the model ties the baseline, otherwise a
        signed infinity.

    Raises:
        ValueError: If ``metric`` is neither ``"ndcg"`` nor ``"mse"``.
    """
    random_predictions = generate_random_predictions(y_true)

    if metric == "ndcg":
        model_score = ndcg_score([y_true], [model_predictions])
        random_score = ndcg_score([y_true], [random_predictions])
        improvement = model_score - random_score
    elif metric == "mse":
        model_score = mean_squared_error(y_true, model_predictions)
        random_score = mean_squared_error(y_true, random_predictions)
        # Lower MSE is better, so the sign is flipped relative to NDCG.
        improvement = random_score - model_score
    else:
        raise ValueError("Unsupported metric")

    if random_score == 0:
        # A zero baseline would otherwise yield NaN (NumPy scalars) or raise
        # ZeroDivisionError (Python floats), and a single NaN poisons any
        # average the caller takes over many samples.
        if improvement == 0:
            lift = 0.0
        else:
            lift = float("inf") if improvement > 0 else float("-inf")
    else:
        lift = improvement / random_score

    return lift, model_score, random_score

# Define your model architecture
class MultiOutputRegressor(nn.Module):
    """A single linear layer mapping an embedding to ``num_outputs`` regression values."""

    def __init__(self, hidden_size, num_outputs):
        super().__init__()
        # The attribute name becomes the prefix of the state_dict keys, so it
        # must stay stable for previously saved weights to load.
        self.regressor_head = nn.Linear(
            in_features=hidden_size,
            out_features=num_outputs,
        )

    def forward(self, input_ids):
        """Apply the linear head to ``input_ids`` and return its output."""
        predictions = self.regressor_head(input_ids)
        return predictions

# Dataset class
class EmbeddingDataset(torch.utils.data.Dataset):
    """Index-aligned pairing of precomputed embeddings with their labels."""

    def __init__(self, embeddings, labels):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        """Return the number of samples, taken from the embeddings container."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with ``input_ids`` and ``label`` keys."""
        sample = {}
        sample["input_ids"] = self.embeddings[idx]
        sample["label"] = self.labels[idx]
        return sample

# Custom data collator
class CustomDataCollator:
    """Collate per-sample dicts into one batch of stacked tensors."""

    def __call__(self, features):
        """Stack the ``input_ids`` and ``label`` tensors of ``features``.

        Each field is stacked along a new leading dimension, so the batch
        holds one row per sample in the order the samples were given.
        """
        return {
            key: torch.stack([sample[key] for sample in features])
            for key in ("input_ids", "label")
        }

# Custom Trainer
class CustomTrainer(Trainer):
    """Trainer whose loss is the mean squared error between model outputs and labels."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE loss for one batch.

        ``inputs`` must provide ``input_ids`` and ``label`` tensors; both are
        moved to ``self.args.device`` before the forward pass. Additional
        keyword arguments are accepted and ignored.

        Returns the loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        target_device = self.args.device
        features = inputs["input_ids"].to(target_device)
        targets = inputs["label"].to(target_device)

        predictions = model(features)
        criterion = nn.MSELoss()
        loss = criterion(predictions, targets)

        if return_outputs:
            return loss, predictions
        return loss

def main():
    """Cross-validate the regressor, export enriched records, and train the final model.

    Steps:
      1. Load ``labeled_users.json``, embed every record's ``description`` and
         build the label matrix via ``prepare_labels``.
      2. Write the label mappings to ``label_mappings.json``.
      3. Run 5-fold cross-validation, training a fresh ``MultiOutputRegressor``
         per fold and printing the average per-sample NDCG and lift over a
         random baseline.
      4. Attach each record's embedding and its out-of-fold prediction, then
         write the records to ``enriched_data.json`` (one JSON object per line).
      5. Retrain on all data, save the weights to ``multioutput_regressor.pth``
         and run the ``load_and_predict`` demonstration.
    """
    # Load data
    outdata = load_data("labeled_users.json")

    # Extract descriptions
    descriptions = [record['description'] for record in outdata]

    # Generate embeddings
    embedder = EmbeddingGenerator()
    X_embeddings = embedder.generate_embeddings(descriptions)

    # Prepare labels
    y_matrix, label2id, id2label = prepare_labels(outdata)

    # Save label mappings for later use
    mappings = {'label2id': label2id, 'id2label': id2label}
    with open('label_mappings.json', 'w') as f:
        json.dump(mappings, f)

    # Tensors shared by cross-validation and the final training run
    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
    train_labels = torch.tensor(y_matrix, dtype=torch.float)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data_collator = CustomDataCollator()

    n_splits = 5  # Number of folds
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    hidden_size = train_embeddings.shape[1]
    num_outputs = train_labels.shape[1]
    fold_ndcg_scores = []
    # Out-of-fold predictions, stored at each sample's ORIGINAL row index.
    # KFold shuffles, so appending fold by fold would leave the predictions in
    # fold-visit order and pair records with other samples' predictions below.
    all_preds = np.zeros((train_embeddings.shape[0], num_outputs))

    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
        print(f"Fold {fold + 1}/{n_splits}")

        # Split data into training and validation sets
        X_train_fold = train_embeddings[train_index]
        y_train_fold = train_labels[train_index]
        X_val_fold = train_embeddings[val_index]
        y_val_fold = train_labels[val_index]

        # Create datasets
        train_dataset = EmbeddingDataset(X_train_fold, y_train_fold)
        val_dataset = EmbeddingDataset(X_val_fold, y_val_fold)

        # Initialize a fresh model so no weights leak between folds
        model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
        model.to(device)

        # Training arguments
        # NOTE(review): recent transformers releases rename
        # `evaluation_strategy` to `eval_strategy` -- confirm against the
        # installed version.
        training_args = TrainingArguments(
            output_dir=f"./results_fold_{fold+1}",
            num_train_epochs=10,
            per_device_train_batch_size=64,
            logging_dir=f"./logs_fold_{fold+1}",
            evaluation_strategy="no",  # No evaluation during training
            save_strategy="no",  # Not saving checkpoints
            disable_tqdm=True,  # Disable progress bar
            learning_rate=1e-5,
            weight_decay=0.01,  # Apply a small weight decay
            max_grad_norm=1.0  # Clip gradients to 1.0
        )
        # Initialize Trainer
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
        )
        # Train the model
        trainer.train()

        # Evaluate the model on the validation set. The loader is not
        # shuffled, so row k of the predictions belongs to val_index[k].
        val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

        fold_preds = []
        fold_labels = []

        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["label"].to(device)
                outputs = model(input_ids)
                fold_preds.append(outputs.cpu().numpy())
                fold_labels.append(labels.cpu().numpy())

        # Concatenate all predictions and labels for the fold
        y_pred = np.concatenate(fold_preds, axis=0)
        y_true = np.concatenate(fold_labels, axis=0)

        # Store the fold's predictions at the samples' original positions
        all_preds[val_index] = y_pred

        # Compute per-sample NDCG and lift over a random ranking
        all_ndcgs = []
        lifts = []
        for actual_weights, predicted_weights in zip(y_true, y_pred):
            # The returned model score is the sample's NDCG, so there is no
            # need for a second ndcg_score call on the same inputs.
            lift, ndcg, _ = calculate_relative_lift(
                actual_weights, predicted_weights, metric="ndcg"
            )
            lifts.append(lift)
            all_ndcgs.append(ndcg)

        # Average scores for the current fold (0.0 if the fold had no samples)
        avg_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        avg_lift = np.mean(lifts) if lifts else 0.0
        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
        print(f"Average Lift for fold {fold + 1}: {avg_lift:.4f}")
        fold_ndcg_scores.append(avg_ndcg)

    # After all folds
    overall_avg_ndcg = np.mean(fold_ndcg_scores)
    print(f"\nOverall Average NDCG across all folds: {overall_avg_ndcg:.4f}")

    # Store embeddings and out-of-fold predictions in outdata
    for idx, record in enumerate(outdata):
        record['embedding'] = X_embeddings[idx].tolist()
        # Map predictions to labels
        pred = all_preds[idx]
        record['predictions'] = {
            id2label[i]: float(value) for i, value in enumerate(pred)
        }

    # Save enriched data, one JSON object per line
    with open("enriched_data.json", "w") as f:
        for row in outdata:
            f.write(json.dumps(row) + '\n')

    # Save full model trained on entire dataset
    # Re-initialize the model
    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    model.to(device)

    # Create the dataset with all data
    train_dataset = EmbeddingDataset(train_embeddings, train_labels)

    # Training arguments
    # NOTE(review): batch size differs from the cross-validation runs, and
    # learning rate, weight decay and gradient clipping are left at library
    # defaults here, so the fold scores above describe a different training
    # configuration than the saved model -- confirm this is intended.
    training_args = TrainingArguments(
        output_dir="./final_model",
        num_train_epochs=10,  # Adjust as needed
        per_device_train_batch_size=8,
        logging_dir="./logs_final",
        evaluation_strategy="no",
        save_strategy="no",
        disable_tqdm=False,
    )

    # Initialize the Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Train the model on the entire dataset
    trainer.train()

    # Save the model
    model_save_path = 'multioutput_regressor.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    # Optional: Demonstrate loading and using the model
    load_and_predict(embedder, hidden_size, num_outputs, device)

def load_and_predict(embedder, hidden_size, num_outputs, device, sentences=None):
    """
    Load the saved model and label mappings, make predictions on new data,
    and print the predictions mapped to their labels.

    Args:
        embedder: Object exposing ``generate_embeddings(list_of_str)``.
        hidden_size: Input width used to rebuild the model before loading
            ``multioutput_regressor.pth``.
        num_outputs: Number of outputs used to rebuild the model.
        device: Torch device the model and inputs are placed on.
        sentences: Optional iterable of texts to score. When omitted, two
            built-in demonstration sentences are used.
    """
    # Load the label mappings
    with open('label_mappings.json', 'r') as f:
        mappings = json.load(f)
    # JSON object keys are always strings, hence the str(i) lookups below.
    id2label = mappings['id2label']

    # Load the model
    model_save_path = 'multioutput_regressor.pth'
    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict holds, so a tampered file cannot run code.
    state_dict = torch.load(model_save_path, map_location=device, weights_only=True)
    loaded_model.load_state_dict(state_dict)
    loaded_model.to(device)
    loaded_model.eval()

    # Prepare new data for prediction
    if sentences is None:
        new_sentences = [
            "This is a test sentence.",
            "Another example of a sentence to predict."
        ]
    else:
        new_sentences = list(sentences)
    if not new_sentences:
        # Nothing to embed or predict.
        return

    # Generate embeddings for new sentences
    new_embeddings = embedder.generate_embeddings(new_sentences)
    new_embeddings_tensor = torch.tensor(new_embeddings, dtype=torch.float).to(device)

    # Make predictions
    with torch.no_grad():
        predictions = loaded_model(new_embeddings_tensor)
    predictions = predictions.cpu().numpy()

    # Map predictions to labels
    for sentence, pred in zip(new_sentences, predictions):
        label_pred_dict = {
            id2label[str(i)]: float(value) for i, value in enumerate(pred)
        }
        print(f"Sentence: {sentence}")
        print("Predictions:")
        for label, value in label_pred_dict.items():
            print(f"  {label}: {value}")
        print()

# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

    # Disabled snippet kept for reference: it repeats the model-loading steps
    # of load_and_predict(). It cannot simply be uncommented here, because
    # hidden_size, num_outputs, model_save_path and device are local to main().
    # loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    # loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
    # loaded_model.to(device)
    # loaded_model.eval()