# train.py
import json

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import ndcg_score, mean_squared_error
from transformers import Trainer, TrainingArguments

from data_processing import load_data, EmbeddingGenerator, prepare_labels

# Generate random predictions drawn uniformly from the label range,
# used as a naive baseline when measuring model lift
def generate_random_predictions(y_true):
    return np.random.uniform(y_true.min(), y_true.max(), size=y_true.shape)

# Evaluate relative lift of model predictions over the random baseline
def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
    random_predictions = generate_random_predictions(y_true)
    if metric == "ndcg":
        model_score = ndcg_score([y_true], [model_predictions])
        random_score = ndcg_score([y_true], [random_predictions])
        lift = (model_score - random_score) / random_score
    elif metric == "mse":
        model_score = mean_squared_error(y_true, model_predictions)
        random_score = mean_squared_error(y_true, random_predictions)
        # For MSE, lower is better, so lift is the relative error reduction
        lift = (random_score - model_score) / random_score
    else:
        raise ValueError(f"Unsupported metric: {metric}")
    return lift, model_score, random_score
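
# Illustrative usage (values are hypothetical):
#   lift, model_score, random_score = calculate_relative_lift(y_true, y_pred)
#   A lift of 0.25 would mean the model's NDCG is 25% above the random baseline.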

# Define the model architecture: a single linear regression head that maps
# a precomputed sentence embedding to one score per label
class MultiOutputRegressor(nn.Module):
    def __init__(self, hidden_size, num_outputs):
        super().__init__()
        self.regressor_head = nn.Linear(hidden_size, num_outputs)

    def forward(self, input_ids):
        # input_ids holds precomputed embeddings, not token ids; the name is
        # kept for compatibility with the Trainer API
        return self.regressor_head(input_ids)

# Dataset wrapping precomputed embeddings and their label vectors
class EmbeddingDataset(torch.utils.data.Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return {"input_ids": self.embeddings[idx], "label": self.labels[idx]}

# Custom data collator that stacks embedding tensors and label vectors into
# batch tensors (the default collator expects tokenizer-style features)
class CustomDataCollator:
    def __call__(self, features):
        embeddings = torch.stack([item["input_ids"] for item in features])
        labels = torch.stack([item["label"] for item in features])
        return {"input_ids": embeddings, "label": labels}

# Custom Trainer that computes MSE loss between predicted and true label vectors
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        input_ids = inputs["input_ids"].to(self.args.device)
        labels = inputs["label"].to(self.args.device)
        outputs = model(input_ids)
        loss_fct = nn.MSELoss()
        loss = loss_fct(outputs, labels)
        return (loss, outputs) if return_outputs else loss
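
# Note: compute_loss takes **kwargs so it keeps working across transformers
# versions; recent releases pass extra arguments (such as num_items_in_batch)
# to this hook, while older ones call it with only (model, inputs, return_outputs).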

def main():
    # Load data
    outdata = load_data("labeled_users.json")

    # Extract descriptions
    descriptions = [record['description'] for record in outdata]

    # Generate embeddings
    embedder = EmbeddingGenerator()
    X_embeddings = embedder.generate_embeddings(descriptions)

    # Prepare labels
    y_matrix, label2id, id2label = prepare_labels(outdata)

    # Save label mappings for later use
    mappings = {'label2id': label2id, 'id2label': id2label}
    with open('label_mappings.json', 'w') as f:
        json.dump(mappings, f)

    # Convert embeddings and labels to tensors for K-fold cross-validation
    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
    train_labels = torch.tensor(y_matrix, dtype=torch.float)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_collator = CustomDataCollator()

    n_splits = 5  # Number of folds
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    hidden_size = train_embeddings.shape[1]
    num_outputs = train_labels.shape[1]
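    # hidden_size is the dimensionality of the embeddings produced by
    # EmbeddingGenerator (for example, 384 for MiniLM-style sentence
    # embeddings; illustrative, the actual value depends on the embedder),
    # and num_outputs is the number of distinct labels in y_matrix.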
    fold_ndcg_scores = []
    # Out-of-fold predictions, stored at each record's original row index so
    # they line up with outdata below (KFold shuffles, so simply appending
    # fold predictions would scramble the order)
    all_preds = np.zeros((len(train_embeddings), num_outputs), dtype=np.float32)
    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
        print(f"Fold {fold + 1}/{n_splits}")

        # Split data into training and validation sets
        X_train_fold = train_embeddings[train_index]
        y_train_fold = train_labels[train_index]
        X_val_fold = train_embeddings[val_index]
        y_val_fold = train_labels[val_index]

        # Create datasets
        train_dataset = EmbeddingDataset(X_train_fold, y_train_fold)
        val_dataset = EmbeddingDataset(X_val_fold, y_val_fold)

        # Initialize the model
        model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
        model.to(device)
        # Training arguments
        training_args = TrainingArguments(
            output_dir=f"./results_fold_{fold + 1}",
            num_train_epochs=10,
            per_device_train_batch_size=64,
            logging_dir=f"./logs_fold_{fold + 1}",
            evaluation_strategy="no",  # No evaluation during training
            save_strategy="no",        # No checkpoints
            disable_tqdm=True,         # Disable progress bar
            learning_rate=1e-5,
            weight_decay=0.01,         # Apply a small weight decay
            max_grad_norm=1.0,         # Clip gradients to 1.0
        )

        # Initialize Trainer
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
        )

        # Train the model
        trainer.train()
        # Evaluate the model on the validation set
        val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)
        fold_preds = []
        fold_labels = []
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["label"].to(device)
                outputs = model(input_ids)
                fold_preds.append(outputs.cpu().numpy())
                fold_labels.append(labels.cpu().numpy())

        # Concatenate all predictions and labels for the fold
        y_pred = np.concatenate(fold_preds, axis=0)
        y_true = np.concatenate(fold_labels, axis=0)

        # Store fold predictions at their original record indices
        all_preds[val_index] = y_pred
        # Compute per-user NDCG scores, treating each label vector as one ranking
        all_ndcgs = []
        lifts = []
        for i in range(len(y_true)):
            actual_weights = y_true[i]
            predicted_weights = y_pred[i]
            ndcg = ndcg_score([actual_weights], [predicted_weights])
            lift, model_score, random_score = calculate_relative_lift(
                actual_weights, predicted_weights, metric="ndcg"
            )
            lifts.append(lift)
            all_ndcgs.append(ndcg)

        # Average NDCG and lift for the current fold (0.0 if the fold is empty)
        avg_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        avg_lift = np.mean(lifts) if lifts else 0.0

        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
        print(f"Average Lift for fold {fold + 1}: {avg_lift:.4f}")
        fold_ndcg_scores.append(avg_ndcg)
    # After all folds
    overall_avg_ndcg = np.mean(fold_ndcg_scores)
    print(f"\nOverall Average NDCG across all folds: {overall_avg_ndcg:.4f}")

    # Store embeddings and out-of-fold predictions in outdata
    for idx, record in enumerate(outdata):
        record['embedding'] = X_embeddings[idx].tolist()
        # Map predictions to labels
        pred = all_preds[idx]
        record['predictions'] = {id2label[i]: float(pred[i]) for i in range(len(pred))}

    # Save enriched data
    with open("enriched_data.json", "w") as f:
        for row in outdata:
            f.write(json.dumps(row) + '\n')
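    # enriched_data.json is written as JSON Lines: one JSON object per user
    # record, so it can be read back line by line without loading the whole file.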

    # Retrain on the entire dataset to produce the final saved model
    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    model.to(device)

    # Create the dataset with all data
    train_dataset = EmbeddingDataset(train_embeddings, train_labels)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./final_model",
        num_train_epochs=10,  # Adjust as needed
        per_device_train_batch_size=8,
        logging_dir="./logs_final",
        evaluation_strategy="no",
        save_strategy="no",
        disable_tqdm=False,
    )

    # Initialize the Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Train the model on the entire dataset
    trainer.train()

    # Save the model weights
    model_save_path = 'multioutput_regressor.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")
    # Demonstrate loading and using the saved model
    load_and_predict(embedder, hidden_size, num_outputs, device)

def load_and_predict(embedder, hidden_size, num_outputs, device):
    """
    Load the saved model and label mappings, make predictions on new data,
    and map the predictions to labels.
    """
    # Load the label mappings (JSON object keys are strings, hence str(i) below)
    with open('label_mappings.json', 'r') as f:
        mappings = json.load(f)
    id2label = mappings['id2label']

    # Load the model
    model_save_path = 'multioutput_regressor.pth'
    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
    loaded_model.to(device)
    loaded_model.eval()

    # Prepare new data for prediction
    new_sentences = [
        "This is a test sentence.",
        "Another example of a sentence to predict.",
    ]

    # Generate embeddings for new sentences
    new_embeddings = embedder.generate_embeddings(new_sentences)
    new_embeddings_tensor = torch.tensor(new_embeddings, dtype=torch.float).to(device)

    # Make predictions
    with torch.no_grad():
        predictions = loaded_model(new_embeddings_tensor).cpu().numpy()

    # Map predictions to labels
    for sentence, pred in zip(new_sentences, predictions):
        label_pred_dict = {id2label[str(i)]: float(pred[i]) for i in range(len(pred))}
        print(f"Sentence: {sentence}")
        print("Predictions:")
        for label, value in label_pred_dict.items():
            print(f"  {label}: {value}")
        print()

if __name__ == "__main__":
    main()