# train.py
import json

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import ndcg_score, mean_squared_error
from transformers import Trainer, TrainingArguments

from data_processing import load_data, EmbeddingGenerator, prepare_labels

# Generate random predictions drawn uniformly from the label range,
# used as a naive baseline when measuring model lift
def generate_random_predictions(y_true):
    return np.random.uniform(y_true.min(), y_true.max(), size=y_true.shape)

# Evaluate relative lift of model predictions over the random baseline
def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
    random_predictions = generate_random_predictions(y_true)
    if metric == "ndcg":
        model_score = ndcg_score([y_true], [model_predictions])
        random_score = ndcg_score([y_true], [random_predictions])
        lift = (model_score - random_score) / random_score
    elif metric == "mse":
        model_score = mean_squared_error(y_true, model_predictions)
        random_score = mean_squared_error(y_true, random_predictions)
        # For MSE, lower is better, so lift is the relative error reduction
        lift = (random_score - model_score) / random_score
    else:
        raise ValueError(f"Unsupported metric: {metric}")
    return lift, model_score, random_score
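
# Illustrative usage (values are hypothetical):
#   lift, model_score, random_score = calculate_relative_lift(y_true, y_pred)
#   A lift of 0.25 would mean the model's NDCG is 25% above the random baseline.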

# Define the model architecture: a single linear regression head that maps
# a precomputed sentence embedding to one score per label
class MultiOutputRegressor(nn.Module):
    def __init__(self, hidden_size, num_outputs):
        super().__init__()
        self.regressor_head = nn.Linear(hidden_size, num_outputs)

    def forward(self, input_ids):
        # input_ids holds precomputed embeddings, not token ids; the name is
        # kept for compatibility with the Trainer API
        return self.regressor_head(input_ids)

# Dataset wrapping precomputed embeddings and their label vectors
class EmbeddingDataset(torch.utils.data.Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return {"input_ids": self.embeddings[idx], "label": self.labels[idx]}

# Custom data collator that stacks embedding tensors and label vectors into
# batch tensors (the default collator expects tokenizer-style features)
class CustomDataCollator:
    def __call__(self, features):
        embeddings = torch.stack([item["input_ids"] for item in features])
        labels = torch.stack([item["label"] for item in features])
        return {"input_ids": embeddings, "label": labels}

# Custom Trainer that computes MSE loss between predicted and true label vectors
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        input_ids = inputs["input_ids"].to(self.args.device)
        labels = inputs["label"].to(self.args.device)
        outputs = model(input_ids)
        loss_fct = nn.MSELoss()
        loss = loss_fct(outputs, labels)
        return (loss, outputs) if return_outputs else loss
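
# Note: compute_loss takes **kwargs so it keeps working across transformers
# versions; recent releases pass extra arguments (such as num_items_in_batch)
# to this hook, while older ones call it with only (model, inputs, return_outputs).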

def main():
    # Load data
    outdata = load_data("labeled_users.json")

    # Extract descriptions
    descriptions = [record['description'] for record in outdata]

    # Generate embeddings
    embedder = EmbeddingGenerator()
    X_embeddings = embedder.generate_embeddings(descriptions)

    # Prepare labels
    y_matrix, label2id, id2label = prepare_labels(outdata)

    # Save label mappings for later use
    mappings = {'label2id': label2id, 'id2label': id2label}
    with open('label_mappings.json', 'w') as f:
        json.dump(mappings, f)

    # Convert embeddings and labels to tensors for K-fold cross-validation
    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
    train_labels = torch.tensor(y_matrix, dtype=torch.float)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_collator = CustomDataCollator()

    n_splits = 5  # Number of folds
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    hidden_size = train_embeddings.shape[1]
    num_outputs = train_labels.shape[1]
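    # hidden_size is the dimensionality of the embeddings produced by
    # EmbeddingGenerator (for example, 384 for MiniLM-style sentence
    # embeddings; illustrative, the actual value depends on the embedder),
    # and num_outputs is the number of distinct labels in y_matrix.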
    fold_ndcg_scores = []
    # Out-of-fold predictions, stored at each record's original row index so
    # they line up with outdata below (KFold shuffles, so simply appending
    # fold predictions would scramble the order)
    all_preds = np.zeros((len(train_embeddings), num_outputs), dtype=np.float32)
    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
        print(f"Fold {fold + 1}/{n_splits}")

        # Split data into training and validation sets
        X_train_fold = train_embeddings[train_index]
        y_train_fold = train_labels[train_index]
        X_val_fold = train_embeddings[val_index]
        y_val_fold = train_labels[val_index]

        # Create datasets
        train_dataset = EmbeddingDataset(X_train_fold, y_train_fold)
        val_dataset = EmbeddingDataset(X_val_fold, y_val_fold)

        # Initialize the model
        model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
        model.to(device)
        # Training arguments
        training_args = TrainingArguments(
            output_dir=f"./results_fold_{fold + 1}",
            num_train_epochs=10,
            per_device_train_batch_size=64,
            logging_dir=f"./logs_fold_{fold + 1}",
            evaluation_strategy="no",  # No evaluation during training
            save_strategy="no",        # No checkpoints
            disable_tqdm=True,         # Disable progress bar
            learning_rate=1e-5,
            weight_decay=0.01,         # Apply a small weight decay
            max_grad_norm=1.0,         # Clip gradients to 1.0
        )

        # Initialize Trainer
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
        )

        # Train the model
        trainer.train()
        # Evaluate the model on the validation set
        val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)
        fold_preds = []
        fold_labels = []
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["label"].to(device)
                outputs = model(input_ids)
                fold_preds.append(outputs.cpu().numpy())
                fold_labels.append(labels.cpu().numpy())

        # Concatenate all predictions and labels for the fold
        y_pred = np.concatenate(fold_preds, axis=0)
        y_true = np.concatenate(fold_labels, axis=0)

        # Store fold predictions at their original record indices
        all_preds[val_index] = y_pred
        # Compute per-user NDCG scores, treating each label vector as one ranking
        all_ndcgs = []
        lifts = []
        for i in range(len(y_true)):
            actual_weights = y_true[i]
            predicted_weights = y_pred[i]
            ndcg = ndcg_score([actual_weights], [predicted_weights])
            lift, model_score, random_score = calculate_relative_lift(
                actual_weights, predicted_weights, metric="ndcg"
            )
            lifts.append(lift)
            all_ndcgs.append(ndcg)

        # Average NDCG and lift for the current fold (0.0 if the fold is empty)
        avg_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        avg_lift = np.mean(lifts) if lifts else 0.0

        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
        print(f"Average Lift for fold {fold + 1}: {avg_lift:.4f}")
        fold_ndcg_scores.append(avg_ndcg)
    # After all folds
    overall_avg_ndcg = np.mean(fold_ndcg_scores)
    print(f"\nOverall Average NDCG across all folds: {overall_avg_ndcg:.4f}")

    # Store embeddings and out-of-fold predictions in outdata
    for idx, record in enumerate(outdata):
        record['embedding'] = X_embeddings[idx].tolist()
        # Map predictions to labels
        pred = all_preds[idx]
        record['predictions'] = {id2label[i]: float(pred[i]) for i in range(len(pred))}

    # Save enriched data
    with open("enriched_data.json", "w") as f:
        for row in outdata:
            f.write(json.dumps(row) + '\n')
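    # enriched_data.json is written as JSON Lines: one JSON object per user
    # record, so it can be read back line by line without loading the whole file.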

    # Retrain on the entire dataset to produce the final saved model
    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    model.to(device)

    # Create the dataset with all data
    train_dataset = EmbeddingDataset(train_embeddings, train_labels)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./final_model",
        num_train_epochs=10,  # Adjust as needed
        per_device_train_batch_size=8,
        logging_dir="./logs_final",
        evaluation_strategy="no",
        save_strategy="no",
        disable_tqdm=False,
    )

    # Initialize the Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Train the model on the entire dataset
    trainer.train()

    # Save the model weights
    model_save_path = 'multioutput_regressor.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")
    # Demonstrate loading and using the saved model
    load_and_predict(embedder, hidden_size, num_outputs, device)

def load_and_predict(embedder, hidden_size, num_outputs, device):
    """
    Load the saved model and label mappings, make predictions on new data,
    and map the predictions to labels.
    """
    # Load the label mappings (JSON object keys are strings, hence str(i) below)
    with open('label_mappings.json', 'r') as f:
        mappings = json.load(f)
    id2label = mappings['id2label']

    # Load the model
    model_save_path = 'multioutput_regressor.pth'
    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
    loaded_model.to(device)
    loaded_model.eval()

    # Prepare new data for prediction
    new_sentences = [
        "This is a test sentence.",
        "Another example of a sentence to predict.",
    ]

    # Generate embeddings for new sentences
    new_embeddings = embedder.generate_embeddings(new_sentences)
    new_embeddings_tensor = torch.tensor(new_embeddings, dtype=torch.float).to(device)

    # Make predictions
    with torch.no_grad():
        predictions = loaded_model(new_embeddings_tensor).cpu().numpy()

    # Map predictions to labels
    for sentence, pred in zip(new_sentences, predictions):
        label_pred_dict = {id2label[str(i)]: float(pred[i]) for i in range(len(pred))}
        print(f"Sentence: {sentence}")
        print("Predictions:")
        for label, value in label_pred_dict.items():
            print(f"  {label}: {value}")
        print()

if __name__ == "__main__":
    main()