# train.py
import json

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import ndcg_score, mean_squared_error
from transformers import Trainer, TrainingArguments

from data_processing import load_data, EmbeddingGenerator, prepare_labels


# Generate random predictions drawn uniformly from the observed label range
def generate_random_predictions(y_true):
    return np.random.uniform(y_true.min(), y_true.max(), size=y_true.shape)


# Evaluate relative lift of the model over a random baseline
def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
    random_predictions = generate_random_predictions(y_true)
    if metric == "ndcg":
        model_score = ndcg_score([y_true], [model_predictions])
        random_score = ndcg_score([y_true], [random_predictions])
        lift = (model_score - random_score) / random_score
    elif metric == "mse":
        model_score = mean_squared_error(y_true, model_predictions)
        random_score = mean_squared_error(y_true, random_predictions)
        lift = (random_score - model_score) / random_score
    else:
        raise ValueError("Unsupported metric")
    return lift, model_score, random_score


# Model architecture: a single linear regression head over the embeddings
class MultiOutputRegressor(nn.Module):
    def __init__(self, hidden_size, num_outputs):
        super().__init__()
        self.regressor_head = nn.Linear(hidden_size, num_outputs)

    def forward(self, input_ids):
        # "input_ids" is kept as the field name for Trainer compatibility;
        # the values are precomputed embeddings, not token ids.
        return self.regressor_head(input_ids)


# Dataset class wrapping precomputed embeddings and label vectors
class EmbeddingDataset(torch.utils.data.Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return {"input_ids": self.embeddings[idx], "label": self.labels[idx]}


# Custom data collator: stack individual examples into batch tensors
class CustomDataCollator:
    def __call__(self, features):
        embeddings = torch.stack([item["input_ids"] for item in features])
        labels = torch.stack([item["label"] for item in features])
        return {"input_ids": embeddings, "label": labels}


# Custom Trainer with an MSE loss over the multi-output regression head
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        input_ids = inputs["input_ids"].to(self.args.device)
        labels = inputs["label"].to(self.args.device)
        outputs = model(input_ids)
        loss_fct = nn.MSELoss()
        loss = loss_fct(outputs, labels)
        return (loss, outputs) if return_outputs else loss
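
# A minimal sketch of how the pieces above fit together, with toy shapes
# (hidden_size=4 and num_outputs=2 are assumptions for illustration only):
#
#     emb = torch.randn(3, 4)                       # 3 examples, hidden_size=4
#     lab = torch.rand(3, 2)                        # num_outputs=2
#     ds = EmbeddingDataset(emb, lab)
#     batch = CustomDataCollator()([ds[0], ds[1]])
#     # batch["input_ids"].shape == (2, 4); batch["label"].shape == (2, 2)
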
def main():
    # Load data
    outdata = load_data("labeled_users.json")

    # Extract descriptions
    descriptions = [record['description'] for record in outdata]

    # Generate embeddings
    embedder = EmbeddingGenerator()
    X_embeddings = embedder.generate_embeddings(descriptions)

    # Prepare labels
    y_matrix, label2id, id2label = prepare_labels(outdata)

    # Save label mappings for later use (json.dump stringifies integer keys,
    # which is why load_and_predict indexes id2label with str(i))
    mappings = {'label2id': label2id, 'id2label': id2label}
    with open('label_mappings.json', 'w') as f:
        json.dump(mappings, f)

    # K-fold cross-validation
    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
    train_labels = torch.tensor(y_matrix, dtype=torch.float)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data_collator = CustomDataCollator()

    n_splits = 5  # Number of folds
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    hidden_size = train_embeddings.shape[1]
    num_outputs = train_labels.shape[1]

    fold_ndcg_scores = []
    # Out-of-fold predictions, written back at their original row positions.
    # KFold shuffles, so simply appending each fold's predictions would leave
    # them out of order relative to outdata.
    oof_preds = np.zeros_like(y_matrix, dtype=float)

    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
        print(f"Fold {fold + 1}/{n_splits}")

        # Split data into training and validation sets
        X_train_fold = train_embeddings[train_index]
        y_train_fold = train_labels[train_index]
        X_val_fold = train_embeddings[val_index]
        y_val_fold = train_labels[val_index]

        # Create datasets
        train_dataset = EmbeddingDataset(X_train_fold, y_train_fold)
        val_dataset = EmbeddingDataset(X_val_fold, y_val_fold)

        # Initialize the model
        model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
        model.to(device)

        # Training arguments
        training_args = TrainingArguments(
            output_dir=f"./results_fold_{fold + 1}",
            num_train_epochs=10,
            per_device_train_batch_size=64,
            logging_dir=f"./logs_fold_{fold + 1}",
            evaluation_strategy="no",  # No evaluation during training (renamed eval_strategy in newer transformers)
            save_strategy="no",  # No checkpoints
            disable_tqdm=True,  # Disable progress bar
            learning_rate=1e-5,
            weight_decay=0.01,  # Apply a small weight decay
            max_grad_norm=1.0,  # Clip gradients to 1.0
        )

        # Initialize Trainer
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
        )

        # Train the model
        trainer.train()

        # Evaluate the model on the validation set
        val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

        fold_preds = []
        fold_labels = []
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["label"].to(device)
                outputs = model(input_ids)
                fold_preds.append(outputs.cpu().numpy())
                fold_labels.append(labels.cpu().numpy())

        # Concatenate all predictions and labels for the fold
        y_pred = np.concatenate(fold_preds, axis=0)
        y_true = np.concatenate(fold_labels, axis=0)

        # Store fold predictions at their original row positions
        oof_preds[val_index] = y_pred

        # Compute per-example NDCG and lift scores
        all_ndcgs = []
        lifts = []
        for i in range(len(y_true)):
            actual_weights = y_true[i]
            predicted_weights = y_pred[i]
            ndcg = ndcg_score([actual_weights], [predicted_weights])
            lift, model_score, random_score = calculate_relative_lift(
                actual_weights, predicted_weights, metric="ndcg"
            )
            lifts.append(lift)
            all_ndcgs.append(ndcg)

        # Average NDCG and lift for the current fold (guard against empty folds)
        avg_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        avg_lift = np.mean(lifts) if lifts else 0.0

        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
        print(f"Average lift for fold {fold + 1}: {avg_lift:.4f}")
        fold_ndcg_scores.append(avg_ndcg)

    # After all folds
    overall_avg_ndcg = np.mean(fold_ndcg_scores)
    print(f"\nOverall average NDCG across all folds: {overall_avg_ndcg:.4f}")

    # Store embeddings and out-of-fold predictions in outdata
    for idx, record in enumerate(outdata):
        record['embedding'] = X_embeddings[idx].tolist()
        # Map predictions to labels
        pred = oof_preds[idx]
        record['predictions'] = {id2label[i]: float(pred[i]) for i in range(len(pred))}

    # Save enriched data, one JSON object per line
    with open("enriched_data.json", "w") as f:
        for row in outdata:
            f.write(json.dumps(row) + '\n')
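
    # Schematically, each line of enriched_data.json is one JSON object of the
    # form below (the label names and values are illustrative only):
    #
    #     {"description": "...", "embedding": [0.12, ...],
    #      "predictions": {"some_label": 0.83, "another_label": 0.07}}
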
    # Retrain on the entire dataset for the final saved model
    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    model.to(device)

    # Create the dataset with all data
    train_dataset = EmbeddingDataset(train_embeddings, train_labels)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./final_model",
        num_train_epochs=10,  # Adjust as needed
        per_device_train_batch_size=8,
        logging_dir="./logs_final",
        evaluation_strategy="no",
        save_strategy="no",
        disable_tqdm=False,
    )

    # Initialize the Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Train the model on the entire dataset
    trainer.train()

    # Save the model weights
    model_save_path = 'multioutput_regressor.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    # Optional: demonstrate loading and using the model
    load_and_predict(embedder, hidden_size, num_outputs, device)


def load_and_predict(embedder, hidden_size, num_outputs, device):
    """
    Load the saved model and label mappings, make predictions on new data,
    and map the predictions to labels.
    """
    # Load the label mappings (keys were stringified by json.dump)
    with open('label_mappings.json', 'r') as f:
        mappings = json.load(f)
    id2label = mappings['id2label']

    # Load the model
    model_save_path = 'multioutput_regressor.pth'
    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
    loaded_model.to(device)
    loaded_model.eval()

    # Prepare new data for prediction
    new_sentences = [
        "This is a test sentence.",
        "Another example of a sentence to predict.",
    ]

    # Generate embeddings for the new sentences
    new_embeddings = embedder.generate_embeddings(new_sentences)
    new_embeddings_tensor = torch.tensor(new_embeddings, dtype=torch.float).to(device)

    # Make predictions
    with torch.no_grad():
        predictions = loaded_model(new_embeddings_tensor)
    predictions = predictions.cpu().numpy()

    # Map predictions to labels
    for sentence, pred in zip(new_sentences, predictions):
        label_pred_dict = {id2label[str(i)]: float(pred[i]) for i in range(len(pred))}
        print(f"Sentence: {sentence}")
        print("Predictions:")
        for label, value in label_pred_dict.items():
            print(f"  {label}: {value}")
        print()


if __name__ == "__main__":
    main()
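
# Note: this script assumes a companion data_processing module. A sketch of the
# interface it relies on (names come from the imports above; signatures are
# inferred from usage here, not from the actual module):
#
#     load_data(path) -> list[dict]    # records with a 'description' field
#     EmbeddingGenerator().generate_embeddings(texts) -> array of shape (n, hidden_size)
#     prepare_labels(records) -> (y_matrix, label2id, id2label)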