|
|
|
|
|
import numpy as np |
|
import torch |
|
from torch import nn |
|
from torch.utils.data import DataLoader |
|
from sklearn.model_selection import KFold |
|
from transformers import Trainer, TrainingArguments |
|
from sklearn.metrics import ndcg_score |
|
import json |
|
|
|
from data_processing import load_data, EmbeddingGenerator, prepare_labels |
|
from utils import compute_ndcg |
|
import numpy as np |
|
from sklearn.metrics import ndcg_score, mean_squared_error |
|
|
|
|
|
def generate_random_predictions(y_true, rng=None):
    """Draw a uniform-random baseline spanning the observed range of ``y_true``.

    Args:
        y_true: Array-like of true values. It is converted with
            ``np.asarray`` so plain lists are accepted as well as arrays.
        rng: Optional random source exposing ``uniform(low, high, size=...)``
            (for example ``np.random.default_rng(seed)``). When omitted,
            NumPy's global random state is used, which is the historical
            behaviour of this function.

    Returns:
        ``np.ndarray`` with the same shape as ``y_true`` whose entries are
        sampled uniformly between ``y_true.min()`` and ``y_true.max()``.
        An empty input yields an empty float array of the same shape.
    """
    y_true = np.asarray(y_true)
    if y_true.size == 0:
        # min()/max() of an empty array raise an unhelpful reduction error;
        # an empty baseline is the natural answer for an empty input.
        return np.empty(y_true.shape, dtype=float)

    sampler = np.random if rng is None else rng
    return sampler.uniform(y_true.min(), y_true.max(), size=y_true.shape)
|
|
|
|
|
def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
    """Compare model predictions against a uniform-random baseline.

    A random prediction vector is drawn with ``generate_random_predictions``
    and both it and ``model_predictions`` are scored against ``y_true``.

    Args:
        y_true: True values for a single sample (one row of targets).
        model_predictions: Predicted values, same length as ``y_true``.
        metric: ``"ndcg"`` (higher is better) or ``"mse"`` (lower is better).

    Returns:
        Tuple ``(lift, model_score, random_score)``. ``lift`` is the
        improvement of the model over the random baseline divided by the
        baseline score; the sign is arranged so that positive always means
        "model is better". When the baseline score is exactly zero the ratio
        is undefined: ``0.0`` is returned if the model ties the baseline,
        otherwise a signed infinity.

    Raises:
        ValueError: If ``metric`` is neither ``"ndcg"`` nor ``"mse"``.
    """
    # Reject bad metrics before doing any work or touching random state.
    if metric not in ("ndcg", "mse"):
        raise ValueError("Unsupported metric")

    random_predictions = generate_random_predictions(y_true)

    if metric == "ndcg":
        model_score = ndcg_score([y_true], [model_predictions])
        random_score = ndcg_score([y_true], [random_predictions])
        improvement = model_score - random_score
    else:
        model_score = mean_squared_error(y_true, model_predictions)
        random_score = mean_squared_error(y_true, random_predictions)
        # For an error metric a smaller score is the improvement.
        improvement = random_score - model_score

    if random_score == 0:
        # Dividing by a zero baseline would produce NaN/inf plus a NumPy
        # RuntimeWarning, and a single NaN poisons any average the caller
        # takes. A tie against a zero baseline is reported as "no lift".
        if improvement == 0:
            lift = 0.0
        else:
            lift = float(np.copysign(np.inf, improvement))
    else:
        lift = improvement / random_score

    return lift, model_score, random_score
|
|
|
|
|
class MultiOutputRegressor(nn.Module):
    """Single linear layer mapping an embedding to ``num_outputs`` values."""

    def __init__(self, hidden_size, num_outputs):
        """Create the linear head.

        Args:
            hidden_size: Width of the incoming feature vectors.
            num_outputs: Number of regression targets produced per input.
        """
        super().__init__()
        # The attribute name is part of the saved state_dict keys.
        self.regressor_head = nn.Linear(
            in_features=hidden_size,
            out_features=num_outputs,
        )

    def forward(self, input_ids):
        """Apply the linear head to ``input_ids`` and return the result."""
        projected = self.regressor_head(input_ids)
        return projected
|
|
|
|
|
class EmbeddingDataset(torch.utils.data.Dataset):
    """Index-addressable pairing of embeddings with their labels."""

    def __init__(self, embeddings, labels):
        """Store the two parallel, indexable collections as given."""
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        """Return the number of stored embeddings."""
        count = len(self.embeddings)
        return count

    def __getitem__(self, idx):
        """Return ``{"input_ids": embedding, "label": label}`` for ``idx``."""
        embedding = self.embeddings[idx]
        label = self.labels[idx]
        return {"input_ids": embedding, "label": label}
|
|
|
|
|
class CustomDataCollator:
    """Collate per-sample dicts into one batch dict of stacked tensors."""

    def __call__(self, features):
        """Stack the ``"input_ids"`` and ``"label"`` tensors of ``features``.

        Args:
            features: Sequence of dicts, each holding an ``"input_ids"``
                tensor and a ``"label"`` tensor.

        Returns:
            Dict with the same two keys, each mapped to the tensors stacked
            along a new leading dimension.
        """
        return {
            key: torch.stack([sample[key] for sample in features])
            for key in ("input_ids", "label")
        }
|
|
|
|
|
class CustomTrainer(Trainer):
    """Trainer whose loss is mean-squared error against ``inputs["label"]``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE between the model output and the batch labels.

        Args:
            model: Module called with the ``"input_ids"`` tensor.
            inputs: Batch dict providing ``"input_ids"`` and ``"label"``.
            return_outputs: When true, also return the model output.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` if ``return_outputs``.
        """
        target_device = self.args.device
        features = inputs["input_ids"].to(target_device)
        targets = inputs["label"].to(target_device)

        predictions = model(features)
        loss = nn.MSELoss()(predictions, targets)

        if return_outputs:
            return (loss, predictions)
        return loss
|
|
|
def _fit_regressor(train_dataset, hidden_size, num_outputs, device, data_collator, training_args):
    """Train a fresh MultiOutputRegressor on ``train_dataset`` and return it."""
    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    model.to(device)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )
    trainer.train()
    return model


def _predict_in_batches(model, dataset, data_collator, batch_size=8):
    """Run ``model`` over ``dataset`` without gradients; return (preds, labels) arrays."""
    # CustomTrainer moves batches to its own args.device during training, so
    # evaluate on whichever device the model actually lives on afterwards
    # instead of assuming it matches the device chosen by the caller.
    model_device = next(model.parameters()).device
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    batch_preds = []
    batch_labels = []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            outputs = model(batch["input_ids"].to(model_device))
            batch_preds.append(outputs.cpu().numpy())
            batch_labels.append(batch["label"].cpu().numpy())

    return np.concatenate(batch_preds, axis=0), np.concatenate(batch_labels, axis=0)


def _score_fold(y_true, y_pred):
    """Return (mean NDCG, mean relative NDCG lift) over the rows of one fold."""
    ndcgs = []
    lifts = []
    for actual_weights, predicted_weights in zip(y_true, y_pred):
        # model_score is the NDCG of the model's ranking for this row, so a
        # separate ndcg_score call for the same row is unnecessary.
        lift, model_score, _ = calculate_relative_lift(
            actual_weights, predicted_weights, metric="ndcg"
        )
        lifts.append(lift)
        ndcgs.append(model_score)

    avg_ndcg = np.mean(ndcgs) if ndcgs else 0.0
    avg_lift = np.mean(lifts) if lifts else 0.0
    return avg_ndcg, avg_lift


def main():
    """Cross-validate, enrich the records, train the final model and demo it.

    Steps, in order:
      1. Load records from ``labeled_users.json`` and embed each record's
         ``'description'`` with ``EmbeddingGenerator``.
      2. Build the label matrix and label/id mappings via ``prepare_labels``
         and write the mappings to ``label_mappings.json``.
      3. Run 5-fold shuffled cross-validation, printing the average NDCG and
         average lift over a random baseline for every fold.
      4. Attach each record's embedding and its out-of-fold prediction and
         write one JSON object per line to ``enriched_data.json``.
      5. Train a final model on all data, save its state dict to
         ``multioutput_regressor.pth`` and call ``load_and_predict``.
    """
    outdata = load_data("labeled_users.json")
    descriptions = [record['description'] for record in outdata]

    embedder = EmbeddingGenerator()
    # Assumed to be a 2-D NumPy array (rows are later read with .tolist()).
    X_embeddings = embedder.generate_embeddings(descriptions)

    y_matrix, label2id, id2label = prepare_labels(outdata)

    mappings = {'label2id': label2id, 'id2label': id2label}
    with open('label_mappings.json', 'w', encoding='utf-8') as f:
        json.dump(mappings, f)

    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
    train_labels = torch.tensor(y_matrix, dtype=torch.float)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_collator = CustomDataCollator()

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    hidden_size = train_embeddings.shape[1]
    num_outputs = train_labels.shape[1]
    fold_ndcg_scores = []

    # Out-of-fold predictions, stored at each sample's ORIGINAL row index.
    # The folds are shuffled, so appending predictions in fold order would
    # pair every record with some other record's prediction further down.
    oof_preds = np.zeros((len(train_embeddings), num_outputs), dtype=np.float32)

    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
        print(f"Fold {fold + 1}/{n_splits}")

        train_dataset = EmbeddingDataset(
            train_embeddings[train_index], train_labels[train_index]
        )
        val_dataset = EmbeddingDataset(
            train_embeddings[val_index], train_labels[val_index]
        )

        # The deprecated ``evaluation_strategy="no"`` keyword is not passed:
        # no evaluation is wanted here, which is believed to be the library
        # default under both the old and the renamed argument.
        training_args = TrainingArguments(
            output_dir=f"./results_fold_{fold+1}",
            num_train_epochs=10,
            per_device_train_batch_size=64,
            logging_dir=f"./logs_fold_{fold+1}",
            save_strategy="no",
            disable_tqdm=True,
            learning_rate=1e-5,
            weight_decay=0.01,
            max_grad_norm=1.0,
        )

        model = _fit_regressor(
            train_dataset, hidden_size, num_outputs, device, data_collator, training_args
        )

        y_pred, y_true = _predict_in_batches(model, val_dataset, data_collator)
        oof_preds[val_index] = y_pred

        avg_ndcg, avg_lift = _score_fold(y_true, y_pred)
        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
        print(f"Average Lift for fold {fold + 1}: {avg_lift:.4f}")
        fold_ndcg_scores.append(avg_ndcg)

    overall_avg_ndcg = np.mean(fold_ndcg_scores)
    print(f"\nOverall Average NDCG across all folds: {overall_avg_ndcg:.4f}")

    for idx, record in enumerate(outdata):
        record['embedding'] = X_embeddings[idx].tolist()
        record['predictions'] = {
            id2label[i]: float(value) for i, value in enumerate(oof_preds[idx])
        }

    with open("enriched_data.json", "w", encoding="utf-8") as f:
        for row in outdata:
            f.write(json.dumps(row) + '\n')

    # NOTE(review): the final model is trained with a batch size of 8 and the
    # library-default optimiser settings, whereas the cross-validation above
    # used batch size 64, learning_rate=1e-5, weight_decay=0.01 and
    # max_grad_norm=1.0. The CV scores therefore describe a different
    # training configuration -- confirm this is intended.
    final_args = TrainingArguments(
        output_dir="./final_model",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        logging_dir="./logs_final",
        save_strategy="no",
        disable_tqdm=False,
    )
    full_dataset = EmbeddingDataset(train_embeddings, train_labels)
    model = _fit_regressor(
        full_dataset, hidden_size, num_outputs, device, data_collator, final_args
    )

    model_save_path = 'multioutput_regressor.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    load_and_predict(embedder, hidden_size, num_outputs, device)
|
|
|
def load_and_predict(embedder, hidden_size, num_outputs, device, new_sentences=None):
    """
    Load the saved model and label mappings, make predictions on new data,
    and map the predictions to labels.

    Args:
        embedder: Object whose ``generate_embeddings(list_of_str)`` returns
            something ``torch.tensor`` can convert to a float tensor.
        hidden_size: Input width the saved model was built with.
        num_outputs: Output width the saved model was built with.
        device: Torch device used for loading the weights and for inference.
        new_sentences: Optional iterable of sentences to score. When omitted,
            two built-in demo sentences are used.

    Returns:
        List with one ``{label: predicted_value}`` dict per sentence, in
        input order. The same information is also printed.
    """
    if new_sentences is None:
        new_sentences = [
            "This is a test sentence.",
            "Another example of a sentence to predict."
        ]
    else:
        # Materialise once: the sentences are embedded and then iterated again.
        new_sentences = list(new_sentences)

    if not new_sentences:
        return []

    with open('label_mappings.json', 'r', encoding='utf-8') as f:
        mappings = json.load(f)
    # JSON object keys are always strings, hence the str(i) lookups below.
    id2label = mappings['id2label']

    model_save_path = 'multioutput_regressor.pth'
    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
    loaded_model.to(device)
    loaded_model.eval()

    new_embeddings = embedder.generate_embeddings(new_sentences)
    new_embeddings_tensor = torch.tensor(new_embeddings, dtype=torch.float).to(device)

    with torch.no_grad():
        predictions = loaded_model(new_embeddings_tensor)
    predictions = predictions.cpu().numpy()

    results = []
    for sentence, pred in zip(new_sentences, predictions):
        label_pred_dict = {id2label[str(i)]: float(value) for i, value in enumerate(pred)}
        print(f"Sentence: {sentence}")
        print("Predictions:")
        for label, value in label_pred_dict.items():
            print(f" {label}: {value}")
        print()
        results.append(label_pred_dict)

    return results
|
|
|
# Run the full pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|