dgaff
/

bsky_user_classifier

Model card Files Files and versions Community

dgaff committed on Nov 15, 2024

Commit

5ff507b

1 Parent(s): 08cb742

init model

Browse files

Files changed (8) hide show

LICENSE +21 -0
data_processing.py +55 -0
inference.py +48 -0
label_mappings.json +1 -0
multioutput_regressor.pth +3 -0
requirements.txt +7 -0
train.py +301 -0
utils.py +15 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Devin Gaffney
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data_processing.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# data_processing.py
+import json
+import torch
+from transformers import DistilBertTokenizerFast, DistilBertModel
+import numpy as np
+def load_data(file_path):
+    with open(file_path, 'r') as f:
+        dataset = json.load(f)
+    outdata = [
+        {
+            "did": e["user_id"],
+            "description": e["description"],
+            "label_weights": e["user_categories"]
+        }
+        for e in dataset
+        if e["description"] and e["user_categories"]
+    ]
+    return outdata
+def prepare_labels(outdata):
+    all_labels = sorted({label for record in outdata for label in record['label_weights'].keys()})
+    label2id = {label: i for i, label in enumerate(all_labels)}
+    id2label = {i: label for label, i in label2id.items()}
+    y_matrix = np.zeros((len(outdata), len(all_labels)), dtype=float)
+    for idx, record in enumerate(outdata):
+        for label, weight in record['label_weights'].items():
+            y_matrix[idx, label2id[label]] = weight
+    return y_matrix, label2id, id2label
+class EmbeddingGenerator:
+    def __init__(self, model_name='distilbert-base-uncased', device=None):
+        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
+        self.embedding_model = DistilBertModel.from_pretrained(model_name)
+        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.embedding_model.to(self.device)
+    def generate_embeddings(self, descriptions, batch_size=1000):
+        all_embeddings = []
+        descriptions = [desc for desc in descriptions]
+        for i in range(0, len(descriptions), batch_size):
+            batch_descriptions = descriptions[i:i + batch_size]
+            inputs = self.tokenizer(
+                batch_descriptions,
+                padding=True,
+                truncation=True,
+                max_length=128,
+                return_tensors="pt"
+            ).to(self.device)
+            with torch.no_grad():
+                outputs = self.embedding_model(**inputs)
+            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+            all_embeddings.append(batch_embeddings)
+        return np.vstack(all_embeddings)

inference.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# inference.py
+import torch
+import numpy as np
+import joblib
+import json
+from transformers import DistilBertTokenizerFast, DistilBertModel
+class Predictor:
+    def __init__(self, model_path='xgboost_model.joblib', mappings_path='label_mappings.json', device=None):
+        # Load the XGBoost model
+        self.model = joblib.load(model_path)
+        # Load label mappings
+        with open(mappings_path, 'r') as f:
+            mappings = json.load(f)
+        self.id2label = {int(k): v for k, v in mappings['id2label'].items()}
+        # Load the tokenizer and embedding model
+        self.tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+        self.embedding_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.embedding_model.to(self.device)
+    def generate_embedding(self, text):
+        inputs = self.tokenizer(
+            [text],
+            padding=True,
+            truncation=True,
+            max_length=128,
+            return_tensors="pt"
+        ).to(self.device)
+        with torch.no_grad():
+            outputs = self.embedding_model(**inputs)
+        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+        return embedding
+    def predict(self, text):
+        embedding = self.generate_embedding(text)
+        y_pred = self.model.predict(embedding)
+        predictions = {self.id2label[i]: float(y_pred[0][i]) for i in range(len(self.id2label))}
+        return predictions
+# Example usage
+if __name__ == "__main__":
+    predictor = Predictor()
+    text = "I write about American politics"
+    predictions = predictor.predict(text)
+    print(predictions)

label_mappings.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"label2id": {"Animals": 0, "Art": 1, "Books": 2, "Comedy": 3, "Comics": 4, "Culture": 5, "Education": 6, "Food": 7, "Journalism": 8, "Movies": 9, "Music": 10, "Nature": 11, "News": 12, "Pets": 13, "Photography": 14, "Politics": 15, "Science": 16, "Software Dev": 17, "Sports": 18, "TV": 19, "Tech": 20, "Video Games": 21, "Writers": 22}, "id2label": {"0": "Animals", "1": "Art", "2": "Books", "3": "Comedy", "4": "Comics", "5": "Culture", "6": "Education", "7": "Food", "8": "Journalism", "9": "Movies", "10": "Music", "11": "Nature", "12": "News", "13": "Pets", "14": "Photography", "15": "Politics", "16": "Science", "17": "Software Dev", "18": "Sports", "19": "TV", "20": "Tech", "21": "Video Games", "22": "Writers"}}

multioutput_regressor.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1a318d1aaf0f83962c6acb25834ccae74413b7686c2622257f91df42f99781d
+size 72364

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+numpy
+pandas
+torch
+transformers
+scikit-learn
+xgboost
+joblib

train.py ADDED Viewed

	@@ -0,0 +1,301 @@

+# train.py
+import numpy as np
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from sklearn.model_selection import KFold
+from transformers import Trainer, TrainingArguments
+from sklearn.metrics import ndcg_score
+import json
+from data_processing import load_data, EmbeddingGenerator, prepare_labels
+from utils import compute_ndcg
+import numpy as np
+from sklearn.metrics import ndcg_score, mean_squared_error
+# Generate random predictions based on label distribution
+def generate_random_predictions(y_true):
+    return np.random.uniform(y_true.min(), y_true.max(), size=y_true.shape)
+# Evaluate relative lift
+def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
+    random_predictions = generate_random_predictions(y_true)
+    if metric == "ndcg":
+        model_score = ndcg_score([y_true], [model_predictions])
+        random_score = ndcg_score([y_true], [random_predictions])
+        lift = (model_score - random_score) / random_score
+    elif metric == "mse":
+        model_score = mean_squared_error(y_true, model_predictions)
+        random_score = mean_squared_error(y_true, random_predictions)
+        lift = (random_score - model_score) / random_score
+    else:
+        raise ValueError("Unsupported metric")
+    return lift, model_score, random_score
+# Define your model architecture
+class MultiOutputRegressor(nn.Module):
+    def __init__(self, hidden_size, num_outputs):
+        super(MultiOutputRegressor, self).__init__()
+        self.regressor_head = nn.Linear(hidden_size, num_outputs)
+    def forward(self, input_ids):
+        return self.regressor_head(input_ids)
+# Dataset class
+class EmbeddingDataset(torch.utils.data.Dataset):
+    def __init__(self, embeddings, labels):
+        self.embeddings = embeddings
+        self.labels = labels
+    def __len__(self):
+        return len(self.embeddings)
+    def __getitem__(self, idx):
+        return {"input_ids": self.embeddings[idx], "label": self.labels[idx]}
+# Custom data collator
+class CustomDataCollator:
+    def __call__(self, features):
+        embeddings = torch.stack([item["input_ids"] for item in features])
+        labels = torch.stack([item["label"] for item in features])
+        batch_data = {"input_ids": embeddings, "label": labels}
+        return batch_data
+# Custom Trainer
+class CustomTrainer(Trainer):
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        input_ids = inputs["input_ids"].to(self.args.device)
+        labels = inputs["label"].to(self.args.device)
+        outputs = model(input_ids)
+        loss_fct = nn.MSELoss()
+        loss = loss_fct(outputs, labels)
+        return (loss, outputs) if return_outputs else loss
+def main():
+    # Load data
+    outdata = load_data("labeled_users.json")
+    # Extract descriptions
+    descriptions = [record['description'] for record in outdata]
+    # Generate embeddings
+    embedder = EmbeddingGenerator()
+    X_embeddings = embedder.generate_embeddings(descriptions)
+    # Prepare labels
+    y_matrix, label2id, id2label = prepare_labels(outdata)
+    # Save label mappings for later use
+    mappings = {'label2id': label2id, 'id2label': id2label}
+    with open('label_mappings.json', 'w') as f:
+        json.dump(mappings, f)
+    # K-Fold Cross Validation
+    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
+    train_labels = torch.tensor(y_matrix, dtype=torch.float)
+    # Device configuration
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    data_collator = CustomDataCollator()
+    n_splits = 5  # Number of folds
+    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+    hidden_size = train_embeddings.shape[1]
+    num_outputs = train_labels.shape[1]
+    fold_ndcg_scores = []
+    all_preds = []
+    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
+        print(f"Fold {fold + 1}/{n_splits}")
+        # Split data into training and validation sets
+        X_train_fold = train_embeddings[train_index]
+        y_train_fold = train_labels[train_index]
+        X_val_fold = train_embeddings[val_index]
+        y_val_fold = train_labels[val_index]
+        # Create datasets
+        train_dataset = EmbeddingDataset(X_train_fold, y_train_fold)
+        val_dataset = EmbeddingDataset(X_val_fold, y_val_fold)
+        # Initialize the model
+        model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
+        model.to(device)
+        # Training arguments
+        training_args = TrainingArguments(
+            output_dir=f"./results_fold_{fold+1}",
+            num_train_epochs=10,
+            per_device_train_batch_size=64,
+            logging_dir=f"./logs_fold_{fold+1}",
+            evaluation_strategy="no",  # No evaluation during training
+            save_strategy="no",  # Not saving checkpoints
+            disable_tqdm=True,  # Disable progress bar
+            learning_rate=1e-5,
+            weight_decay=0.01,  # Apply a small weight decay
+            max_grad_norm=1.0  # Clip gradients to 1.0
+        )
+        # Initialize Trainer
+        trainer = CustomTrainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            data_collator=data_collator,
+        )
+        # Train the model
+        trainer.train()
+        # Evaluate the model on the validation set
+        val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)
+        fold_preds = []
+        fold_labels = []
+        model.eval()
+        with torch.no_grad():
+            for batch in val_dataloader:
+                input_ids = batch["input_ids"].to(device)
+                labels = batch["label"].to(device)
+                outputs = model(input_ids)
+                fold_preds.append(outputs.cpu().numpy())
+                fold_labels.append(labels.cpu().numpy())
+        # Concatenate all predictions and labels for the fold
+        y_pred = np.concatenate(fold_preds, axis=0)
+        y_true = np.concatenate(fold_labels, axis=0)
+        # Append fold predictions to all_preds
+        all_preds.extend(y_pred)
+        # Compute NDCG scores
+        all_ndcgs = []
+        lifts = []
+        for i in range(len(y_true)):
+            actual_weights = y_true[i]
+            predicted_weights = y_pred[i]
+            ndcg = ndcg_score([actual_weights], [predicted_weights])
+            lift, model_score, random_score = calculate_relative_lift(actual_weights, predicted_weights, metric="ndcg")
+            lifts.append(lift)
+            all_ndcgs.append(ndcg)
+        # Average NDCG score for the current fold
+        if all_ndcgs:
+            avg_ndcg = np.mean(all_ndcgs)
+        else:
+            avg_ndcg = 0.0  # Handle cases where there are no non-zero weights
+        if lifts:
+            avg_lift = np.mean(lifts)
+        else:
+            avg_lift = 0.0  # Handle cases where there are no non-zero weights
+        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
+        print(f"Average Lift for fold {fold + 1}: {avg_lift:.4f}")
+        fold_ndcg_scores.append(avg_ndcg)
+    # After all folds
+    overall_avg_ndcg = np.mean(fold_ndcg_scores)
+    print(f"\nOverall Average NDCG across all folds: {overall_avg_ndcg:.4f}")
+    # Store embeddings and predictions in outdata
+    for idx, record in enumerate(outdata):
+        record['embedding'] = X_embeddings[idx].tolist()
+        # Map predictions to labels
+        pred = all_preds[idx]
+        label_pred_dict = {id2label[i]: float(pred[i]) for i in range(len(pred))}
+        record['predictions'] = label_pred_dict
+    # Save enriched data
+    with open("enriched_data.json", "w") as f:
+        for row in outdata:
+            _ = f.write(json.dumps(row) + '\n')
+    # Save full model trained on entire dataset
+    # Re-initialize the model
+    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
+    model.to(device)
+    # Create the dataset with all data
+    train_dataset = EmbeddingDataset(train_embeddings, train_labels)
+    # Training arguments
+    training_args = TrainingArguments(
+        output_dir="./final_model",
+        num_train_epochs=10,  # Adjust as needed
+        per_device_train_batch_size=8,
+        logging_dir="./logs_final",
+        evaluation_strategy="no",
+        save_strategy="no",
+        disable_tqdm=False,
+    )
+    # Initialize the Trainer
+    trainer = CustomTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        data_collator=data_collator,
+    )
+    # Train the model on the entire dataset
+    trainer.train()
+    # Save the model
+    model_save_path = 'multioutput_regressor.pth'
+    torch.save(model.state_dict(), model_save_path)
+    print(f"Model saved to {model_save_path}")
+    # Optional: Demonstrate loading and using the model
+    load_and_predict(embedder, hidden_size, num_outputs, device)
+def load_and_predict(embedder, hidden_size, num_outputs, device):
+    """
+    Load the saved model and label mappings, make predictions on new data,
+    and map the predictions to labels.
+    """
+    # Load the label mappings
+    with open('label_mappings.json', 'r') as f:
+        mappings = json.load(f)
+    id2label = mappings['id2label']
+    # Load the model
+    model_save_path = 'multioutput_regressor.pth'
+    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
+    loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
+    loaded_model.to(device)
+    loaded_model.eval()
+    # Prepare new data for prediction
+    new_sentences = [
+        "This is a test sentence.",
+        "Another example of a sentence to predict."
+    ]
+    # Generate embeddings for new sentences
+    new_embeddings = embedder.generate_embeddings(new_sentences)
+    new_embeddings_tensor = torch.tensor(new_embeddings, dtype=torch.float).to(device)
+    # Make predictions
+    with torch.no_grad():
+        predictions = loaded_model(new_embeddings_tensor)
+    predictions = predictions.cpu().numpy()
+    # Map predictions to labels
+    for sentence, pred in zip(new_sentences, predictions):
+        label_pred_dict = {id2label[str(i)]: float(pred[i]) for i in range(len(pred))}
+        print(f"Sentence: {sentence}")
+        print("Predictions:")
+        for label, value in label_pred_dict.items():
+            print(f"  {label}: {value}")
+        print()
+if __name__ == "__main__":
+    main()
+    # loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
+    # loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
+    # loaded_model.to(device)
+    # loaded_model.eval()

utils.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import numpy as np
+def compute_dcg(relevances):
+    relevances = np.asarray(relevances)
+    discounts = np.log2(np.arange(len(relevances)) + 2)
+    return np.sum(relevances / discounts)
+def compute_ndcg(actual_relevances, predicted_relevances, k=None):
+    order = np.argsort(-predicted_relevances)
+    actual_relevances = actual_relevances[order]
+    if k is not None:
+        actual_relevances = actual_relevances[:k]
+    dcg = compute_dcg(actual_relevances)
+    idcg = compute_dcg(np.sort(actual_relevances)[::-1])
+    return dcg / idcg if idcg > 0 else 0