# data_processing.py
"""Load labeled Bluesky user data, build a label-weight matrix, and
generate DistilBERT embeddings for user descriptions."""
import json

import numpy as np
import torch
from transformers import DistilBertModel, DistilBertTokenizerFast


def load_data(file_path):
    """Load the labeled-user JSON file, keeping only records that have
    both a non-empty description and at least one category weight."""
    with open(file_path, 'r') as f:
        dataset = json.load(f)
    outdata = [
        {
            "did": e["user_id"],
            "description": e["description"],
            "label_weights": e["user_categories"],
        }
        for e in dataset
        # .get() skips records that are missing either field instead of raising KeyError
        if e.get("description") and e.get("user_categories")
    ]
    return outdata
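
# A minimal usage sketch (the file name and field values below are hypothetical).
# load_data expects a JSON array of objects shaped like:
#   {"user_id": "did:plc:abc123",
#    "description": "Painter, occasional coder",
#    "user_categories": {"art": 0.7, "tech": 0.3}}
# records = load_data("labeled_users.json")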


def prepare_labels(outdata):
    """Build a dense (n_records, n_labels) weight matrix plus label <-> id maps."""
    all_labels = sorted({label for record in outdata for label in record['label_weights']})
    label2id = {label: i for i, label in enumerate(all_labels)}
    id2label = {i: label for label, i in label2id.items()}
    y_matrix = np.zeros((len(outdata), len(all_labels)), dtype=float)
    for idx, record in enumerate(outdata):
        for label, weight in record['label_weights'].items():
            y_matrix[idx, label2id[label]] = weight
    return y_matrix, label2id, id2label
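
# Worked example (hypothetical records): two records over labels {"art", "tech"}
# yield a 2x2 matrix. all_labels is sorted, so "art" -> column 0, "tech" -> column 1;
# a record with {"tech": 0.9} becomes the row [0.0, 0.9]. Absent labels stay 0.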


class EmbeddingGenerator:
    """Wraps a DistilBERT encoder for batched embedding of user descriptions."""

    def __init__(self, model_name='distilbert-base-uncased', device=None):
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.embedding_model = DistilBertModel.from_pretrained(model_name)
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.embedding_model.to(self.device)
        self.embedding_model.eval()  # inference only; make sure dropout is disabled
    def generate_embeddings(self, descriptions, batch_size=1000):
        """Embed each description as its final-layer [CLS] vector, in batches."""
        all_embeddings = []
        descriptions = list(descriptions)  # materialize so slicing works on any iterable
        for i in range(0, len(descriptions), batch_size):
            batch_descriptions = descriptions[i:i + batch_size]
            inputs = self.tokenizer(
                batch_descriptions,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            ).to(self.device)
            with torch.no_grad():  # no gradients needed for inference
                outputs = self.embedding_model(**inputs)
            # Use the hidden state of the first ([CLS]) token as the sentence embedding.
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(batch_embeddings)
        return np.vstack(all_embeddings)
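

if __name__ == "__main__":
    # End-to-end sketch tying the pieces together; the input path
    # "labeled_users.json" is hypothetical.
    records = load_data("labeled_users.json")
    y, label2id, id2label = prepare_labels(records)
    generator = EmbeddingGenerator()
    X = generator.generate_embeddings([r["description"] for r in records])
    # X: (n_records, 768) DistilBERT [CLS] embeddings; y: (n_records, n_labels) weights.
    print(X.shape, y.shape, len(label2id))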