|
from sklearn.metrics import roc_auc_score, roc_curve |
|
|
|
import datetime |
|
import os |
|
import umap |
|
import numpy as np |
|
|
|
import matplotlib.pyplot as plt |
|
import pandas as pd |
|
import pickle |
|
import json |
|
|
|
from xgboost import XGBClassifier, XGBRegressor |
|
import xgboost as xgb |
|
from sklearn.metrics import roc_auc_score, mean_squared_error |
|
import xgboost as xgb |
|
from sklearn.svm import SVR |
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.kernel_ridge import KernelRidge |
|
import json |
|
from sklearn.compose import TransformedTargetRegressor |
|
from sklearn.preprocessing import MinMaxScaler |
|
|
|
|
|
import torch |
|
from transformers import AutoTokenizer, AutoModel |
|
|
|
from .selfies_model.load import SELFIES as bart |
|
from .mhg_model import load as mhg |
|
from .smi_ted.smi_ted_light.load import load_smi_ted |
|
|
|
# Module-level registries. They start out as empty dicts; the avail_* helpers
# in this file rebind them (via ``global``) to the actual description records.
datasets = dict()
models = dict()
downstream_models = dict()
|
|
|
|
|
def avail_models_data():
    """Rebind the module-level ``datasets`` and ``models`` registries.

    Both globals are replaced by freshly built lists of dictionaries that
    describe the hard-coded benchmark datasets and foundation models.
    Nothing is returned.
    """
    global datasets
    global models

    dataset_fields = ("Dataset", "Input", "Output", "Path", "Timestamp")
    dataset_rows = (
        ("hiv", "smiles", "HIV_active", "data/hiv", "2024-06-26 11:27:37"),
        ("esol", "smiles", "ESOL predicted log solubility in mols per litre", "data/esol", "2024-06-26 11:31:46"),
        ("freesolv", "smiles", "expt", "data/freesolv", "2024-06-26 11:33:47"),
        ("lipo", "smiles", "y", "data/lipo", "2024-06-26 11:34:37"),
        ("bace", "smiles", "Class", "data/bace", "2024-06-26 11:36:40"),
        ("bbbp", "smiles", "p_np", "data/bbbp", "2024-06-26 11:39:23"),
        ("clintox", "smiles", "CT_TOX", "data/clintox", "2024-06-26 11:42:43"),
    )
    datasets = [dict(zip(dataset_fields, row)) for row in dataset_rows]

    model_fields = ("Name", "Model Name", "Description", "Timestamp")
    model_rows = (
        ("bart", "SELFIES-TED", "BART model for string based SELFIES modality", "2024-06-21 12:32:20"),
        ("mol-xl", "Molformer", "MolFormer model for string based SMILES modality", "2024-06-21 12:35:56"),
        ("mhg", "MHG-GED", "Molecular hypergraph model", "2024-07-10 00:09:42"),
        ("smi-ted", "SMI-TED", "SMILES based encoder decoder model", "2024-07-10 00:09:42"),
    )
    models = [dict(zip(model_fields, row)) for row in model_rows]
|
|
|
|
|
def avail_models(raw=False):
    """Return the foundation models that can be used for representations.

    The module-level ``models`` registry is rebound to the hard-coded list
    below on every call.

    Args:
        raw: When true, return the list of dictionaries itself (including the
            internal ``"Name"`` key). Otherwise return a pandas DataFrame with
            the ``"Name"`` column dropped, leaving ``"Model Name"`` and
            ``"Description"``.

    Returns:
        list[dict] when ``raw`` is true, else ``pandas.DataFrame``.
    """
    global models

    models = [
        {"Name": "smi-ted", "Model Name": "SMI-TED", "Description": "SMILES based encoder decoder model"},
        {"Name": "bart", "Model Name": "SELFIES-TED", "Description": "BART model for string based SELFIES modality"},
        {"Name": "mol-xl", "Model Name": "Molformer", "Description": "MolFormer model for string based SMILES modality"},
        {"Name": "mhg", "Model Name": "MHG-GED", "Description": "Molecular hypergraph model"},
    ]

    if raw:
        return models
    # The original had an extra ``return models`` after this point that could
    # never execute; it has been removed.
    return pd.DataFrame(models).drop('Name', axis=1)
|
|
|
def avail_downstream_models():
    """Load the downstream-model registry from ``downstream_models.json``.

    The file is read from the current working directory. Its parsed content
    is stored in the module-level ``downstream_models`` and also returned.
    """
    global downstream_models

    with open("downstream_models.json", "r") as json_file:
        loaded = json.load(json_file)

    downstream_models = loaded
    return downstream_models
|
|
|
def avail_datasets():
    """Return the built-in benchmark datasets.

    The module-level ``datasets`` registry is rebound to a new list of
    dictionaries (keys ``Dataset``, ``Input``, ``Output``, ``Path`` and
    ``Timestamp``) and that same list is returned.
    """
    global datasets

    fields = ("Dataset", "Input", "Output", "Path", "Timestamp")
    rows = (
        ("hiv", "smiles", "HIV_active", "data/hiv", "2024-06-26 11:27:37"),
        ("esol", "smiles", "ESOL predicted log solubility in mols per litre", "data/esol", "2024-06-26 11:31:46"),
        ("freesolv", "smiles", "expt", "data/freesolv", "2024-06-26 11:33:47"),
        ("lipo", "smiles", "y", "data/lipo", "2024-06-26 11:34:37"),
        ("bace", "smiles", "Class", "data/bace", "2024-06-26 11:36:40"),
        ("bbbp", "smiles", "p_np", "data/bbbp", "2024-06-26 11:39:23"),
        ("clintox", "smiles", "CT_TOX", "data/clintox", "2024-06-26 11:42:43"),
    )
    datasets = [dict(zip(fields, row)) for row in rows]

    return datasets
|
|
|
def reset():
    """Write the default registries to JSON files in the working directory.

    Three files are (re)created: ``datasets.json``, ``models.json`` and
    ``downstream_models.json``. The registries built here are local to this
    function, so module-level state is left untouched. Nothing is returned.
    """

    def _records(fields, rows):
        # Turn positional rows into dictionaries with a fixed key order.
        return [dict(zip(fields, row)) for row in rows]

    default_datasets = _records(
        ("Dataset", "Input", "Output", "Path", "Timestamp"),
        (
            ("hiv", "smiles", "HIV_active", "data/hiv", "2024-06-26 11:27:37"),
            ("esol", "smiles", "ESOL predicted log solubility in mols per litre", "data/esol", "2024-06-26 11:31:46"),
            ("freesolv", "smiles", "expt", "data/freesolv", "2024-06-26 11:33:47"),
            ("lipo", "smiles", "y", "data/lipo", "2024-06-26 11:34:37"),
            ("bace", "smiles", "Class", "data/bace", "2024-06-26 11:36:40"),
            ("bbbp", "smiles", "p_np", "data/bbbp", "2024-06-26 11:39:23"),
            ("clintox", "smiles", "CT_TOX", "data/clintox", "2024-06-26 11:42:43"),
        ),
    )

    named_fields = ("Name", "Description", "Timestamp")

    default_models = _records(
        named_fields,
        (
            ("bart", "BART model for string based SELFIES modality", "2024-06-21 12:32:20"),
            ("mol-xl", "MolFormer model for string based SMILES modality", "2024-06-21 12:35:56"),
            ("mhg", "MHG", "2024-07-10 00:09:42"),
            ("spec-gru", "Spectrum modality with GRU", "2024-07-10 00:09:42"),
            ("spec-lstm", "Spectrum modality with LSTM", "2024-07-10 00:09:54"),
            ("3d-vae", "VAE model for 3D atom positions", "2024-07-10 00:10:08"),
        ),
    )

    default_downstream_models = _records(
        named_fields,
        (
            ("XGBClassifier", "XG Boost Classifier", "2024-06-21 12:31:20"),
            ("XGBRegressor", "XG Boost Regressor", "2024-06-21 12:32:56"),
            ("2-FNN", "A two layer feedforward network", "2024-06-24 14:34:16"),
            ("3-FNN", "A three layer feedforward network", "2024-06-24 14:38:37"),
        ),
    )

    targets = (
        ("datasets.json", default_datasets),
        ("models.json", default_models),
        ("downstream_models.json", default_downstream_models),
    )
    for filename, payload in targets:
        with open(filename, "w") as json_file:
            json.dump(payload, json_file)
|
|
|
def update_data_list(list_data):
    """Persist ``list_data`` to ``datasets.json`` and refresh the registries.

    Args:
        list_data: JSON-serialisable dataset registry to write to
            ``datasets.json`` in the current working directory.

    After writing, ``avail_models_data()`` is called, which rebinds the
    module-level ``datasets`` and ``models`` to their hard-coded defaults.
    """
    with open("datasets.json", "w") as outfile:
        # Fix: the original dumped the module-level ``datasets`` and silently
        # ignored the ``list_data`` argument, unlike the sibling update_*
        # functions, which write the list they are given.
        json.dump(list_data, outfile)

    avail_models_data()
|
|
|
def update_model_list(list_model):
    """Write ``list_model`` to ``models.json``, then refresh the registries.

    The file is created in the current working directory. Afterwards
    ``avail_models_data()`` is invoked to rebind the module-level
    ``datasets`` and ``models``.
    """
    with open("models.json", "w") as json_file:
        json.dump(list_model, fp=json_file)

    avail_models_data()
|
|
|
def update_downstream_model_list(list_model):
    """Write ``list_model`` to ``downstream_models.json``, then refresh.

    The file is created in the current working directory. Afterwards
    ``avail_models_data()`` is invoked to rebind the module-level
    ``datasets`` and ``models``.
    """
    with open("downstream_models.json", "w") as json_file:
        json.dump(list_model, fp=json_file)

    avail_models_data()
|
|
|
# Import-time side effect: populate the module-level ``datasets`` and
# ``models`` registries with their hard-coded defaults.
avail_models_data()
|
|
|
def _encode_with_molformer(model, tokenizer, smiles):
    """Return the MoLFormer pooled output for a list (or Series) of SMILES."""
    if not isinstance(smiles, list):
        # Non-list inputs are expected to expose ``.values`` (e.g. a pandas
        # Series), exactly as the original code assumed.
        smiles = list(smiles.values)
    inputs = tokenizer(smiles, padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output


def get_representation(train_data, test_data, model_type, return_tensor=True):
    """Embed train and test molecules with the selected foundation model.

    Args:
        train_data: Training molecules, passed to the model's encoder. For
            ``"mol-xl"`` this is either a list or an object with ``.values``.
        test_data: Test molecules, same conventions as ``train_data``.
        model_type: Internal key (``"mhg"``, ``"bart"``, ``"smi-ted"``,
            ``"mol-xl"``) or one of the display names ``"MHG-GED"``,
            ``"SELFIES-TED"``, ``"MolFormer"``/``"Molformer"``, ``"SMI-TED"``.
        return_tensor: For ``"mhg"`` and ``"mol-xl"`` the embeddings are
            wrapped in ``pandas.DataFrame`` when this is false. For ``"bart"``
            and ``"smi-ted"`` the flag is forwarded to the model's ``encode``.

    Returns:
        ``(x_batch, x_batch_test)``: embeddings of the train and test data.

    Raises:
        ValueError: If ``model_type`` is not a known key or display name.
            (Previously this surfaced as an ``UnboundLocalError`` at the
            ``return`` statement.)
    """
    alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
    model_type = alias.get(model_type, model_type)

    if model_type == "mhg":
        model = mhg.load("models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
        with torch.no_grad():
            x_batch = torch.stack(model.encode(train_data))
            x_batch_test = torch.stack(model.encode(test_data))
        if not return_tensor:
            x_batch = pd.DataFrame(x_batch)
            x_batch_test = pd.DataFrame(x_batch_test)

    elif model_type == "bart":
        model = bart()
        model.load()
        x_batch = model.encode(train_data, return_tensor=return_tensor)
        x_batch_test = model.encode(test_data, return_tensor=return_tensor)

    elif model_type == "smi-ted":
        model = load_smi_ted(folder='./models/smi_ted/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')
        with torch.no_grad():
            x_batch = model.encode(train_data, return_torch=return_tensor)
            x_batch_test = model.encode(test_data, return_torch=return_tensor)

    elif model_type == "mol-xl":
        # NOTE(security): trust_remote_code=True executes code shipped with
        # the hub checkpoint; keep the model id pinned to a trusted source.
        model = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True,
                                          trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)

        x_batch = _encode_with_molformer(model, tokenizer, train_data)
        x_batch_test = _encode_with_molformer(model, tokenizer, test_data)

        if not return_tensor:
            x_batch = pd.DataFrame(x_batch)
            x_batch_test = pd.DataFrame(x_batch_test)

    else:
        raise ValueError(f"Unknown model type: {model_type!r}")

    return x_batch, x_batch_test
|
|
|
# Display name -> internal model key. Both "MolFormer" and "Molformer" are
# accepted because avail_models() reports the latter spelling.
_MODEL_ALIASES = {
    "MHG-GED": "mhg",
    "SELFIES-TED": "bart",
    "MolFormer": "mol-xl",
    "Molformer": "mol-xl",
    "SMI-TED": "smi-ted",
}

# Downstream model name -> factory taking the user-supplied params dict.
_CLASSIFIER_FACTORIES = {
    "XGBClassifier": lambda params: XGBClassifier(**params),
    "DefaultClassifier": lambda params: XGBClassifier(),
}
_REGRESSOR_FACTORIES = {
    "SVR": lambda params: SVR(**params),
    "Kernel Ridge": lambda params: KernelRidge(**params),
    "Linear Regression": lambda params: LinearRegression(**params),
    "DefaultRegressor": lambda params: SVR(kernel="rbf", degree=3, C=5, gamma="scale", epsilon=0.01),
}

# Errors that mean "no usable cached projection": a missing or unreadable file,
# a corrupt pickle, or a payload that does not unpack into two items.
_CACHE_MISS_ERRORS = (OSError, pickle.UnpicklingError, EOFError, AttributeError,
                      ImportError, IndexError, TypeError, ValueError)


def _resolve_model_type(model):
    """Map a display name or internal key to an available key, else None."""
    model_type = _MODEL_ALIASES.get(model, model)
    available = {entry["Name"] for entry in avail_models(raw=True)}
    return model_type if model_type in available else None


def _is_predefined_dataset(dataset):
    """Return True when ``dataset`` names one of the built-in datasets."""
    return any(entry["Dataset"] == dataset for entry in avail_datasets())


def _read_custom_dataset(dataset):
    """Parse ``"train_csv,test_csv,input_col,output_col"`` and load the columns.

    Returns ``(train_inputs, test_inputs, y_train, y_test)``. Each CSV is read
    once (the original read every file twice).
    """
    components = dataset.split(",")
    train_frame = pd.read_csv(components[0])
    test_frame = pd.read_csv(components[1])
    input_col, output_col = components[2], components[3]
    return (train_frame[input_col], test_frame[input_col],
            train_frame[output_col], test_frame[output_col])


def _load_pickled_representation(task, model_type):
    """Load ``(x_train, y_train, x_test, y_test)`` from ./representation."""
    # NOTE(security): pickle.load can execute arbitrary code; these files must
    # come from a trusted source.
    with open(f"./representation/{task}_{model_type}.pkl", "rb") as handle:
        return pickle.load(handle)


def _latent_projection(x_batch, y_batch, split_by_class):
    """Project up to 1000 training embeddings to 2-D with UMAP.

    Returns ``(class_0, class_1)``: the points labelled 0 and 1 when
    ``split_by_class`` is true, otherwise the full projection twice.
    """
    print("Generating latent plots")
    reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
                        verbose=False)
    n_samples = min(1000, len(x_batch))
    features_umap = reducer.fit_transform(x_batch[:n_samples])

    if split_by_class:
        # np.asarray handles Series, lists and arrays alike, replacing the
        # original try/except around ``y_batch.values``.
        labels = np.asarray(y_batch)[:n_samples]
        class_0 = features_umap[[i for i, label in enumerate(labels) if label == 0]]
        class_1 = features_umap[[i for i, label in enumerate(labels) if label == 1]]
    else:
        class_0 = features_umap
        class_1 = features_umap

    print("Generating latent plots : Done")
    return class_0, class_1


def _cached_or_computed_projection(cache_path, x_batch, y_batch, split_by_class):
    """Return the cached 2-D projection if readable, else compute it."""
    if cache_path is not None:
        try:
            # NOTE(security): see _load_pickled_representation.
            with open(cache_path, "rb") as handle:
                class_0, class_1 = pickle.load(handle)
            return class_0, class_1
        except _CACHE_MISS_ERRORS:
            pass  # best-effort cache: fall through and recompute
    return _latent_projection(x_batch, y_batch, split_by_class)


def _evaluate_downstream(downstream_model, params, x_batch, y_batch, x_batch_test, y_batch_test):
    """Fit the named downstream model and score it on the test split.

    Returns ``(result, roc_auc, fpr, tpr)`` for classifiers,
    ``(result, rmse, y_batch_test, predictions)`` for regressors, and ``None``
    when ``downstream_model`` is not recognised.
    """
    params = params or {}

    if downstream_model in _CLASSIFIER_FACTORIES:
        print(f" Calculating ROC AUC Score ...")
        classifier = _CLASSIFIER_FACTORIES[downstream_model](params)
        classifier.fit(x_batch, y_batch)
        y_prob = classifier.predict_proba(x_batch_test)[:, 1]

        roc_auc = roc_auc_score(y_batch_test, y_prob)
        fpr, tpr, _ = roc_curve(y_batch_test, y_prob)
        print(f"ROC-AUC Score: {roc_auc:.4f}")
        return f"ROC-AUC Score: {roc_auc:.4f}", roc_auc, fpr, tpr

    if downstream_model in _REGRESSOR_FACTORIES:
        print(" Calculating RMSE Score ...")
        regressor = _REGRESSOR_FACTORIES[downstream_model](params)
        # Targets are min-max scaled to [-1, 1] for fitting; predictions are
        # mapped back by TransformedTargetRegressor.
        fitted = TransformedTargetRegressor(regressor=regressor,
                                            transformer=MinMaxScaler(feature_range=(-1, 1))
                                            ).fit(x_batch, y_batch)
        y_pred = fitted.predict(x_batch_test)
        RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_pred))
        print(f"RMSE Score: {RMSE_score:.4f}")
        return f"RMSE Score: {RMSE_score:.4f}", RMSE_score, y_batch_test, y_pred

    print("Downstream model not available")
    return None


def single_modal(model, dataset, downstream_model, params):
    """Train and score a downstream model on one foundation model's embeddings.

    Args:
        model: Display name (e.g. ``"SMI-TED"``) or internal key (e.g.
            ``"smi-ted"``) of the foundation model.
        dataset: Name of a built-in dataset, whose pickled representation is
            loaded from ``./representation``, or a custom spec
            ``"train_csv,test_csv,input_col,output_col"``.
        downstream_model: One of ``"XGBClassifier"``, ``"DefaultClassifier"``,
            ``"SVR"``, ``"Kernel Ridge"``, ``"Linear Regression"``,
            ``"DefaultRegressor"``.
        params: Keyword arguments for the downstream estimator (ignored by the
            ``Default*`` variants). ``None`` is treated as ``{}``.

    Returns:
        Classifiers: ``(result, roc_auc, fpr, tpr, class_0, class_1)``.
        Regressors: ``(result, rmse, y_test, y_pred, class_0, class_1)``.
        ``None`` if the model or downstream model is not available.
    """
    print(model)
    # Fix: the original indexed ``alias[model]`` directly, raising KeyError for
    # internal keys and for "Molformer".
    model_type = _resolve_model_type(model)
    if model_type is None:
        print("Model not available")
        return None

    if _is_predefined_dataset(dataset):
        x_batch, y_batch, x_batch_test, y_batch_test = _load_pickled_representation(dataset, model_type)
        plot_cache = f"./plot_emb/{dataset}_{model_type}.pkl"
    else:
        print("Custom Dataset")
        train_data, test_data, y_batch, y_batch_test = _read_custom_dataset(dataset)
        x_batch, x_batch_test = get_representation(train_data, test_data, model_type)
        # Custom datasets have no cached projection (the original relied on a
        # NameError swallowed by a bare ``except`` to get here).
        plot_cache = None
    print(f" Representation loaded successfully")

    evaluation = _evaluate_downstream(downstream_model, params, x_batch, y_batch, x_batch_test, y_batch_test)
    if evaluation is None:
        return None

    is_classifier = downstream_model in _CLASSIFIER_FACTORIES
    # As before, only classification runs consult the projection cache here.
    class_0, class_1 = _cached_or_computed_projection(
        plot_cache if is_classifier else None, x_batch, y_batch, is_classifier)

    return (*evaluation, class_0, class_1)


def multi_modal(model_list, dataset, downstream_model, params):
    """Train and score a downstream model on concatenated embeddings.

    The embeddings of every model in ``model_list`` are concatenated
    column-wise (columns renamed ``"1"``..``"n"``) before fitting.

    Args:
        model_list: Non-empty iterable of display names or internal keys.
        dataset: Built-in dataset name or custom spec
            ``"train_csv,test_csv,input_col,output_col"``.
        downstream_model: Same choices as for ``single_modal``.
        params: Keyword arguments for the downstream estimator. ``None`` is
            treated as ``{}``.

    Returns:
        Same 6-tuples as ``single_modal``; ``None`` if any model (or the
        downstream model) is unavailable or ``model_list`` is empty.
    """
    print(model_list)
    model_types = [_resolve_model_type(model) for model in model_list]
    # Fix: an empty list used to fall through to an UnboundLocalError.
    if not model_types or None in model_types:
        print("Model not available")
        return None

    predefined = _is_predefined_dataset(dataset)
    if not predefined:
        train_data, test_data, y_batch, y_batch_test = _read_custom_dataset(dataset)
        print("Custom Dataset loaded")

    train_frames, test_frames = [], []
    for position, model_type in enumerate(model_types):
        if predefined:
            x_part, y_part, x_test_part, y_test_part = _load_pickled_representation(dataset, model_type)
            print(f" Loaded representation/{dataset}_{model_type}.pkl")
            if position == 0:
                # Labels are taken from the first model's pickle only.
                y_batch, y_batch_test = y_part, y_test_part
        else:
            x_part, x_test_part = get_representation(train_data, test_data, model_type)
        train_frames.append(pd.DataFrame(x_part))
        test_frames.append(pd.DataFrame(x_test_part))

    x_batch = pd.concat(train_frames, axis=1)
    x_batch_test = pd.concat(test_frames, axis=1)

    # Unique string column names; the per-model frames would otherwise repeat
    # the same integer labels.
    x_batch_test.columns = [f'{i + 1}' for i in range(x_batch_test.shape[1])]
    x_batch.columns = [f'{i + 1}' for i in range(x_batch.shape[1])]
    print(f"Representations loaded successfully")

    evaluation = _evaluate_downstream(downstream_model, params, x_batch, y_batch, x_batch_test, y_batch_test)
    if evaluation is None:
        return None

    plot_cache = f"./plot_emb/{dataset}_multi.pkl" if predefined else None
    class_0, class_1 = _cached_or_computed_projection(
        plot_cache, x_batch, y_batch, "Classifier" in downstream_model)

    return (*evaluation, class_0, class_1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|