import os
import datetime
import json
import warnings
from collections import OrderedDict

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import xgboost as xgb
import yaml
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from huggingface_hub import login

from .utils.data import FFTDataset, SplitDataset
from .utils.train import Trainer, XGBoostTrainer
from .utils.models import CNNKan, KanEncoder, CNNKanFeaturesEncoder, CNNFeaturesEncoder
from .utils.data_utils import *

warnings.filterwarnings("ignore")
def create_dataframe(ds, save_name='train'):
    """Flatten the nested feature dicts of `ds` into a feature matrix.

    Nested entries are flattened to columns named "{key}_{subkey}". The
    result is cached as a CSV so repeated runs skip the pass over the
    (streaming) dataset.
    """
    try:
        df = pd.read_csv(f"tasks/utils/dfs/{save_name}.csv")
    except FileNotFoundError:
        data = []
        # Iterate over the dataset
        pbar = tqdm(enumerate(ds))
        for i, batch in pbar:
            label = batch['label']
            features = batch['audio']['features']
            # Flatten the nested dictionary structure
            feature_dict = {'label': label}
            for k, v in features.items():
                if isinstance(v, dict):
                    for sub_k, sub_v in v.items():
                        # Take the first (already aggregated) value as a scalar
                        feature_dict[f"{k}_{sub_k}"] = sub_v[0].item()
            data.append(feature_dict)
        # Convert to DataFrame and cache it
        df = pd.DataFrame(data)
        print(os.getcwd())
        df.to_csv(f"tasks/utils/dfs/{save_name}.csv", index=False)
    X = df.drop(columns=['label'])
    y = df['label']
    return X, y
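
# A minimal sketch of the flattening convention used above, with illustrative
# values only: a nested entry features['mfcc']['mean'] becomes the column
# "mfcc_mean".
_demo_features = {'mfcc': {'mean': torch.tensor([0.5]), 'std': torch.tensor([0.1])}}
_demo_row = {f"{k}_{sub_k}": sub_v[0].item()
             for k, v in _demo_features.items() if isinstance(v, dict)
             for sub_k, sub_v in v.items()}
# _demo_row == {'mfcc_mean': 0.5, 'mfcc_std': 0.1}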
current_date = datetime.date.today().strftime("%Y-%m-%d")
datetime_dir = f"frugal_{current_date}"
args_dir = 'tasks/utils/config.yaml'
with open(args_dir, 'r') as f:
    config = yaml.safe_load(f)
data_args = Container(**config['Data'])
exp_num = data_args.exp_num
model_name = data_args.model_name
model_args = Container(**config['CNNEncoder'])
mlp_args = Container(**config['MLP'])
model_args_f = Container(**config['CNNEncoder_f'])
conformer_args = Container(**config['Conformer'])
kan_args = Container(**config['KAN'])
boost_args = Container(**config['XGBoost'])
if not os.path.exists(f"{data_args.log_dir}/{datetime_dir}"):
    os.makedirs(f"{data_args.log_dir}/{datetime_dir}")
with open("../logs/token.txt", "r") as f:
    api_key = f.read()
# local_rank, world_size, gpus_per_node = setup()
local_rank = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
login(api_key)
dataset = load_dataset("rfcx/frugalai", streaming=True)
full_ds = FFTDataset(dataset["train"], features=True)
train_ds = SplitDataset(FFTDataset(dataset["train"], features=True), is_train=True)
train_dl = DataLoader(train_ds, batch_size=data_args.batch_size, collate_fn=collate_fn)
val_ds = SplitDataset(FFTDataset(dataset["train"], features=True), is_train=False)
val_dl = DataLoader(val_ds, batch_size=data_args.batch_size, collate_fn=collate_fn)
test_ds = FFTDataset(dataset["test"], features=True)
test_dl = DataLoader(test_ds, batch_size=data_args.batch_size, collate_fn=collate_fn)
x, y = create_dataframe(full_ds, save_name='train_val')
print(x.shape)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
evals_result = {}
num_boost_round = 1000  # Upper bound on boosting rounds; early stopping trims this
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)
# Watchlist to monitor performance on train and validation data
watchlist = [(dtrain, 'train'), (dval, 'eval')]
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    **boost_args.get_dict()
}
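
# boost_args comes from the XGBoost section of tasks/utils/config.yaml and is
# merged into params above. A hypothetical example of what that section might
# contain (illustrative only, not the actual config):
#
#   XGBoost:
#     max_depth: 6
#     eta: 0.1
#     subsample: 0.8
#     colsample_bytree: 0.8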
# Train the model
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=10,  # Stop after 10 rounds with no improvement on 'eval'
    evals_result=evals_result,
    verbose_eval=False  # Suppress per-iteration evaluation output
)
xgb_pred = xgb_model.predict(dval, output_margin=False)  # Probability of class 1
# xgb_pred = torch.tensor(xgb_pred, dtype=torch.float32, device=x.device).unsqueeze(1)
y_pred = (xgb_pred >= 0.5).astype(int)
accuracy = accuracy_score(y_val, y_pred)
# ROC AUC is computed from the predicted probabilities, not the thresholded labels
roc_auc = roc_auc_score(y_val, xgb_pred)
print(f'Accuracy: {accuracy:.4f}')
print(f'ROC AUC Score: {roc_auc:.4f}')
# Number of trees actually kept after early stopping
num_xgb_features = xgb_model.best_iteration + 1
print(num_xgb_features)
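
# Optional sketch: the logloss curves collected in evals_result are not used
# below, so plot and save them for inspection (the file name is illustrative).
plt.figure()
plt.plot(evals_result['train']['logloss'], label='train')
plt.plot(evals_result['eval']['logloss'], label='eval')
plt.axvline(xgb_model.best_iteration, linestyle='--', color='gray', label='best iteration')
plt.xlabel('Boosting round')
plt.ylabel('Logloss')
plt.legend()
plt.savefig(f"{data_args.log_dir}/{datetime_dir}/xgb_logloss.png")
plt.close()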
# data = []
#
# # Iterate over the dataset
# for i, batch in enumerate(train_ds):
#     label = batch['label']
#     features = batch['audio']['features']
#
#     # Flatten the nested dictionary structure
#     feature_dict = {'label': label}
#     for k, v in features.items():
#         if isinstance(v, dict):
#             for sub_k, sub_v in v.items():
#                 feature_dict[f"{k}_{sub_k}"] = sub_v[0].item()  # Aggregate (e.g., mean)
#         else:
#             print(k, v.shape)
#
#     data.append(feature_dict)
#     print(i)
#
#     if i > 1000:  # Limit the exploratory pass to ~1000 samples
#         break
#
# # Convert to DataFrame
# df = pd.DataFrame(data)
#
# # Plot feature distributions colored by label
# plt.figure()
# for col in df.columns:
#     if col != 'label':
#         sns.kdeplot(df, x=col, hue='label', fill=True, alpha=0.5)
#         plt.title(f'Distribution of {col}')
#         plt.show()
# exit()

# trainer = XGBoostTrainer(boost_args.get_dict(), train_ds, val_ds, test_ds)
# res = trainer.fit()
# trainer.predict()
# trainer.plot_results(res)
# exit()
# model = DualEncoder(model_args, model_args_f, conformer_args)
# model = FasterKAN([18000,64,64,16,1])
# model = CNNKan(model_args, conformer_args, kan_args.get_dict())
# model = CNNKanFeaturesEncoder(xgb_model, model_args, kan_args.get_dict())
model = CNNFeaturesEncoder(xgb_model, model_args)
# model.kan.speed()
# model = KanEncoder(kan_args.get_dict())
model = model.to(local_rank)
# state_dict = torch.load(data_args.checkpoint_path, map_location=torch.device('cpu'))
# new_state_dict = OrderedDict()
# for key, value in state_dict.items():
#     if key.startswith('module.'):
#         key = key[7:]
#     new_state_dict[key] = value
# missing, unexpected = model.load_state_dict(new_state_dict)
# model = DDP(model, device_ids=[local_rank], output_device=local_rank)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {num_params}")
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
total_steps = int(data_args.num_epochs) * 1000
# NOTE: the cosine scheduler is defined here but not passed to the Trainer
# below (scheduler=None), so it currently has no effect
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                       T_max=total_steps,
                                                       eta_min=5e-5)
# missing, unexpected = model.load_state_dict(torch.load(model_args.checkpoint_path))
# print(f"Missing keys: {missing}")
# print(f"Unexpected keys: {unexpected}")
trainer = Trainer(model=model, optimizer=optimizer,
                  criterion=loss_fn, output_dim=model_args.output_dim, scaler=None,
                  scheduler=None, train_dataloader=train_dl,
                  val_dataloader=val_dl, device=local_rank,
                  exp_num=datetime_dir, log_path=data_args.log_dir,
                  range_update=None,
                  accumulation_step=1, max_iter=np.inf,
                  exp_name=f"frugal_kan_features_{exp_num}")
fit_res = trainer.fit(num_epochs=100, device=local_rank,
                      early_stopping=10, only_p=False, best='loss', conf=True)
output_filename = f'{data_args.log_dir}/{datetime_dir}/{model_name}_frugal_{exp_num}.json'
with open(output_filename, "w") as f:
    json.dump(fit_res, f, indent=2)
preds, tru, acc = trainer.predict(test_dl, local_rank)
print(f"Accuracy: {acc}")