# ---- Configuration and shared inputs ----------------------------------------

# Identifiers used in file names, keyed by the gene symbol shown in the plots.
# The pairs are split into the two parallel vectors `genes` / `gene.names`
# that the rest of the script indexes by position.
genes <- c(
  ABCC8   = "Q09428",
  BRAF    = "P15056",
  CACNA1A = "O00555",
  FGFR2   = "P21802",
  KCNJ11  = "Q14654",
  RET     = "P07949",
  SCN2A   = "Q99250",
  SCN5A   = "Q14524.clean",
  TP53    = "P04637"
)
gene.names <- names(genes)
genes <- unname(genes)

# Python interpreter used further down to run random.forest.glof.py.
py.path <- '/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python'

# Project helpers (presumably plot.AUC() lives in AUROC.R -- confirm).
source('./AUROC.R')

# Accumulators filled by the sections below.
summary.df <- data.frame()
plots <- list()

# Project helpers (presumably prepare.unique.id() / prepare.biochemical()
# live in prepare.biochem.R -- confirm).
source('./prepare.biochem.R')

# Table that the biochemical features are looked up in by `unique.id`;
# both '.' and 'NA' are read as missing values.
ALL <- read.csv('figs/ALL.csv', row.names = 1, na.strings = c('.', 'NA'))
ALL <- prepare.unique.id(ALL)

# Rule for choosing between the default and ".large.window" PreMode runs:
# 'auc', 'loss' or 'auc+loss'; any other value always keeps the default run.
pick.cond <- 'auc'
# ---- Benchmark PreMode against a random-forest baseline ---------------------
# For every gene x training subset x fold this section
#   1. reads the PreMode train/test predictions of the default run and of the
#      ".large.window" run and keeps the one that looks better on the training
#      set according to `pick.cond`;
#   2. runs random.forest.glof.py on the biochemical features once per inner
#      split (4 splits) and averages the AUCs it reports;
#   3. stores two rows (PreMode, random forest) with the AUCs and the number of
#      score == 1 (GoF) / score == -1 (LoF) variants in train and test.
# The result is written to figs/fig.5e.prepare.csv.
#
# Fix relative to the previous version: the AUC vector used to be extended
# with mean(el.aucs), but `el.aucs` is never defined, and a third value would
# not fit the two-row data.frame built below.

# Row-wise mean of the four per-model logit columns.
avg.fold.logits <- function(res) {
  rowMeans(res[, paste0('logits.FOLD.', 0:3)])
}

# First entry of the row-wise mean over the four min_loss columns, returned
# without its row name so it cannot leak into data.frame row names.
first.mean.loss <- function(res) {
  rowMeans(res[, paste0('min_loss.FOLD.', 0:3)])[[1]]
}

summary.rows <- list()

for (i in seq_along(genes)) {
  gene <- genes[i]
  for (subset in c(1, 2, 4, 6, 8)) {
    # Runs with subset == 8 are stored without a ".subset.<k>" tag.
    if (subset == 8) {
      file.tag <- ''
      run.name <- gene
    } else {
      file.tag <- paste0('subset.', subset, '.')
      run.name <- paste0(gene, '.subset.', subset)
    }

    for (fold in 0:4) {
      result.suffix <- paste0(file.tag, 'fold.', fold, '.4fold.csv')
      default.dir <- paste0('PreMode/', gene, '/')
      lw.dir <- paste0('PreMode/', gene, '.large.window/')

      gene.test.res <- read.csv(paste0(default.dir, 'testing.', result.suffix))
      gene.test.res.lw <- read.csv(paste0(lw.dir, 'testing.', result.suffix))
      gene.train.res <- read.csv(paste0(default.dir, 'training.', result.suffix))
      gene.train.res.lw <- read.csv(paste0(lw.dir, 'training.', result.suffix))
      log.yaml <- yaml::read_yaml(paste0('../scripts/PreMode/',
                                         run.name, '.5fold/',
                                         run.name, '.fold.', fold, '.yaml'))

      # Training-set metrics of both runs, used only for model selection.
      tr.auc <- plot.AUC(gene.train.res$score,
                         avg.fold.logits(gene.train.res))$auc
      tr.auc.lw <- plot.AUC(gene.train.res.lw$score,
                            avg.fold.logits(gene.train.res.lw))$auc
      tr.loss <- first.mean.loss(gene.train.res)
      tr.loss.lw <- first.mean.loss(gene.train.res.lw)

      # TRUE -> report the ".large.window" run on the test set.
      cond <- switch(pick.cond,
                     'auc' = tr.auc.lw > tr.auc,
                     'loss' = tr.loss > tr.loss.lw,
                     'auc+loss' = tr.auc.lw / tr.loss.lw > tr.auc / tr.loss,
                     FALSE)

      picked.test <- if (cond) gene.test.res.lw else gene.test.res
      premode.auc <- plot.AUC(picked.test$score,
                              avg.fold.logits(picked.test))$auc

      # Data files and inner split recorded for this PreMode run.
      gene.train <- read.csv(paste0('../', log.yaml$data_file_train))
      gene.test <- read.csv(paste0('../', log.yaml$data_file_test))
      # NOTE(review): this unpickles a file from the run's log directory;
      # only do that with files produced by this project.
      fold.splits <- reticulate::py_load_object(
        paste0('../', log.yaml$log_dir, '/fold_split.pkl'))

      gene.train <- prepare.unique.id(gene.train)
      gene.test <- prepare.unique.id(gene.test)
      train.biochem <- prepare.biochemical(
        ALL[match(gene.train$unique.id, ALL$unique.id), ])
      test.biochem <- prepare.biochemical(
        ALL[match(gene.test$unique.id, ALL$unique.id), ])
      rownames(train.biochem) <- gene.train[, 1]
      rownames(test.biochem) <- gene.test[, 1]

      rf.aucs <- numeric(4)
      for (f in 1:4) {
        # Going by the variable names, these are the validation ids of the two
        # classes for inner split f; rows whose first column matches them are
        # left out of the random-forest training set.
        val.gof.idx <- fold.splits[[1]][[f]]
        val.lof.idx <- fold.splits[[2]][[f]]
        train.idx <- !gene.train[, 1] %in% c(val.gof.idx, val.lof.idx)

        train.biochem.file <- tempfile()
        test.biochem.file <- tempfile()
        train.label.file <- tempfile()
        test.label.file <- tempfile()
        write.csv(train.biochem[train.idx, ], file = train.biochem.file)
        write.csv(test.biochem, file = test.biochem.file)
        write.csv(gene.train[train.idx, ], file = train.label.file)
        write.csv(gene.test, file = test.label.file)

        res <- system(paste(py.path,
                            'random.forest.glof.py',
                            train.biochem.file,
                            train.label.file,
                            test.biochem.file,
                            test.label.file),
                      intern = TRUE)
        # The inputs are no longer needed once the script has returned.
        unlink(c(train.biochem.file, test.biochem.file,
                 train.label.file, test.label.file))

        if (length(res) == 0) {
          stop('random.forest.glof.py produced no output', call. = FALSE)
        }
        # The AUC is parsed as the text after '=' on the first output line.
        rf.aucs[f] <- as.numeric(strsplit(res, split = '=')[[1]][2])
      }

      summary.rows[[length(summary.rows) + 1]] <- data.frame(
        auc = c(premode.auc, mean(rf.aucs)),
        use.lw = c(cond, NA),
        model = c('PreMode.transfer', 'random.forest'),
        seed = fold,
        gene = gene.names[i],
        subset = subset,
        ngof.train = sum(gene.train$score == 1),
        nlof.train = sum(gene.train$score == -1),
        ngof.test = sum(gene.test$score == 1),
        nlof.test = sum(gene.test$score == -1))
    }
  }
}

# Bind once instead of growing summary.df inside the loop.
summary.df <- do.call(rbind, c(list(summary.df), summary.rows))
write.csv(summary.df, file = 'figs/fig.5e.prepare.csv')
# ---- Per-gene panels: raw points, smoothed line, mean +/- SE ----------------
# Builds one ggplot per gene from figs/fig.5e.prepare.csv and lays the panels
# out in three columns. The combined plot is left in `p`; nothing is saved.
library(ggplot2)
library(patchwork)

summary.df <- read.csv('figs/fig.5e.prepare.csv', row.names = 1)
# The horizontal offset below is derived from the integer code of `model`.
# read.csv() returns character columns by default on R >= 4.0, for which
# as.numeric() would yield NA and drop every error bar, so make `model` a
# factor explicitly.
summary.df$model <- factor(summary.df$model)
num.models <- length(unique(summary.df$model))

plots <- list()
for (i in seq_along(genes)) {
  task <- gene.names[i]
  task.res <- summary.df[startsWith(as.character(summary.df$gene), task), ]
  # Drop columns that are NA in the first row of this gene's data.
  task.res <- task.res[, !is.na(task.res[1, ])]
  task.plots <- list()

  # Tick labels: "<ngof.train> | <nlof.train>" of the seed-0 PreMode rows,
  # one per subset.
  ref.rows <- task.res$seed == 0 & task.res$model == "PreMode.transfer"
  data.points <- paste0(task.res$ngof.train[ref.rows],
                        " | ",
                        task.res$nlof.train[ref.rows])

  # x position of the summary layers: models are spread symmetrically around
  # the subset value within a total width of 0.4.
  task.res$x.dodge <- as.numeric(task.res$subset) +
    0.4 * as.numeric(task.res$model) / num.models -
    0.2 * (num.models + 1) / num.models

  p <- ggplot(task.res, aes(x = subset, y = auc, col = model)) +
    geom_point(alpha = 0.2) +
    stat_smooth(geom = 'line', span = 0.3, se = FALSE, alpha = 0.5) +
    scale_y_continuous(breaks = seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
    scale_x_continuous(breaks = c(1, 2, 4, 6, 8),
                       labels = paste0(data.points,
                                       c(" (10%)", " (20%)", " (40%)",
                                         " (60%)", " (80%)"))) +
    stat_summary(aes(x = x.dodge, y = auc, col = model),
                 fun.data = mean_se, geom = "errorbar", width = 0.2) +
    stat_summary(aes(x = x.dodge, y = auc, col = model),
                 fun.data = mean_se, geom = "point") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle(paste0(task)) +
    ggeasy::easy_center_title() +
    xlab("training data size, format: GoF | LoF (%)")
  plots[[i]] <- p
}

# Three-column grid of all panels, however many genes there are.
p <- wrap_plots(plots, ncol = 3)
# ---- Per-gene panels with display names and a shared legend -----------------
# Same layout idea as the previous section, but the models are renamed for
# display, the raw points are fully transparent, and the nine panels are
# arranged with ggpubr::ggarrange() in a fixed order with one common legend.
# The combined plot is left in `p`; nothing is saved.
library(patchwork)
library(ggpubr)

summary.df <- read.csv('figs/fig.5e.prepare.csv', row.names = 1)
summary.df <- summary.df[summary.df$model %in% c('PreMode.transfer', 'random.forest'), ]
model.dic <- c("PreMode.transfer" = "Supervised: PreMode",
               "random.forest" = "Supervised: Random Forest")
# as.character() so the lookup is by name even if `model` was read as factor.
summary.df$model <- model.dic[as.character(summary.df$model)]
summary.df$model <- factor(summary.df$model,
                           levels = c("Supervised: PreMode",
                                      "Supervised: Random Forest"))
gene.names <- unique(summary.df$gene)
num.models <- length(unique(summary.df$model))

plots <- list()
for (i in seq_along(gene.names)) {
  task <- gene.names[i]
  task.res <- summary.df[startsWith(as.character(summary.df$gene), task), ]
  # Drop columns that are NA in the first row of this gene's data.
  task.res <- task.res[, !is.na(task.res[1, ])]
  task.plots <- list()

  # Tick labels: "<ngof.train> | <nlof.train>" of the seed-0 PreMode rows.
  ref.rows <- task.res$seed == 0 & task.res$model == "Supervised: PreMode"
  data.points <- paste0(task.res$ngof.train[ref.rows],
                        " | ",
                        task.res$nlof.train[ref.rows])

  # x position of the summary layers, computed up front so it does not depend
  # on the value of the global `num.models` at the time the plot is drawn.
  task.res$x.dodge <- as.numeric(task.res$subset) +
    0.4 * as.numeric(task.res$model) / num.models -
    0.2 * (num.models + 1) / num.models

  p <- ggplot(task.res, aes(x = subset, y = auc, col = model)) +
    # alpha = 0: the raw points are added but drawn fully transparent.
    geom_point(alpha = 0) +
    stat_smooth(geom = 'line', span = 0.3, se = FALSE, alpha = 0.5) +
    scale_y_continuous(breaks = seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
    scale_x_continuous(breaks = c(1, 2, 4, 6, 8),
                       labels = paste0(data.points,
                                       c(" (10%)", " (20%)", " (40%)",
                                         " (60%)", " (80%)"))) +
    stat_summary(aes(x = x.dodge, y = auc, col = model),
                 fun.data = mean_se, geom = "errorbar", width = 0.2) +
    stat_summary(aes(x = x.dodge, y = auc, col = model),
                 fun.data = mean_se, geom = "point") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "bottom",
          legend.direction = "horizontal") +
    ggtitle(paste0(task)) +
    ggeasy::easy_center_title() +
    xlab("training data size, format: GoF | LoF (%)")

  # Only the fifth panel keeps its colour legend. guides(color = FALSE) is
  # deprecated in ggplot2; "none" is the supported spelling.
  if (i != 5) {
    p <- p + guides(color = "none")
  }
  plots[[i]] <- p
}

# Fixed panel order for the 3 x 3 grid.
p <- ggarrange(plots[[6]], plots[[5]], plots[[3]],
               plots[[2]], plots[[8]], plots[[7]],
               plots[[9]], plots[[1]], plots[[4]],
               ncol = 3, nrow = 3, common.legend = TRUE, legend = "bottom")
# ---- Per-gene panels of mean AUC +/- standard error -------------------------
# Collapses the folds of figs/fig.5e.prepare.csv into one mean and one
# standard error per (model, gene, subset) and plots them per gene.
# `uniq.result.plot` and `summary.df` are reused by the section that follows.
library(patchwork)
library(ggpubr)

summary.df <- read.csv('figs/fig.5e.prepare.csv', row.names = 1)
summary.df <- summary.df[summary.df$model %in% c('PreMode.transfer', 'random.forest'), ]
model.dic <- c("PreMode.transfer" = "PreMode",
               "random.forest" = "Random Forest")
# as.character() so the lookup is by name even if `model` was read as factor.
summary.df$model <- model.dic[as.character(summary.df$model)]
summary.df$model <- factor(summary.df$model, levels = c("PreMode", "Random Forest"))

# One row per (model, gene, subset): start from the seed-0 rows and overwrite
# `auc` with the mean over all seeds.
uniq.result.plot <- summary.df[summary.df$seed == 0, ]
uniq.result.plot$auc.se <- NA_real_
for (i in seq_len(nrow(uniq.result.plot))) {
  aucs <- summary.df$auc[summary.df$model == uniq.result.plot$model[i] &
                           summary.df$gene == uniq.result.plot$gene[i] &
                           summary.df$subset == uniq.result.plot$subset[i]]

  uniq.result.plot$auc[i] <- mean(aucs, na.rm = TRUE)
  # Divide by the number of values that actually entered sd(), not by the
  # length including NAs.
  uniq.result.plot$auc.se[i] <- sd(aucs, na.rm = TRUE) / sqrt(sum(!is.na(aucs)))
}

task.dic <- unique(uniq.result.plot$gene)
plots <- list()
# Holds the model levels (not a count); not used further in this section.
num.models <- unique(uniq.result.plot$model)

for (i in seq_along(task.dic)) {
  # NOTE(review): the panel title is the entry of `genes`, while the rows are
  # selected by `gene.names[i]`; the two vectors are assumed to be parallel.
  task <- (genes)[i]
  task.res <- uniq.result.plot[uniq.result.plot$gene == gene.names[i], ]
  # Drop columns that are NA in the first row of this gene's data.
  task.res <- task.res[, !is.na(task.res[1, ])]

  # Tick labels: "<ngof.train> | <nlof.train>" of the PreMode rows.
  ref.rows <- task.res$seed == 0 & task.res$model == "PreMode"
  data.points <- paste0(task.res$ngof.train[ref.rows],
                        " | ",
                        task.res$nlof.train[ref.rows])
  task.plots <- list()

  p <- ggplot(task.res, aes(x = subset, y = auc, col = model)) +
    geom_point() +
    geom_errorbar(aes(ymin = auc - auc.se, ymax = auc + auc.se), width = .4) +
    geom_line() +
    scale_y_continuous(breaks = seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
    scale_x_continuous(breaks = c(1, 2, 4, 6, 8),
                       labels = paste0(data.points,
                                       c(" (10%)", " (20%)", " (40%)",
                                         " (60%)", " (80%)"))) +
    # The plotted column is `auc`; the axis used to be labelled 'Spearman rho'.
    ylab('AUC') +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle(paste0(task)) +
    ggeasy::easy_center_title() +
    xlab("training data size (%)")
  plots[[i]] <- p
}

# Fixed panel order for the 3 x 3 grid.
p <- ggarrange(plots[[6]], plots[[5]], plots[[3]],
               plots[[2]], plots[[8]], plots[[7]],
               plots[[9]], plots[[1]], plots[[4]],
               ncol = 3, nrow = 3, common.legend = TRUE, legend = "bottom")
# ---- Weighted average across genes and final figure -------------------------
# Collapses the per-gene means in `uniq.result.plot` into one value per
# (model, subset), weighting each gene by 1 / (1/ngof.train + 1/nlof.train)
# taken from the seed-0 PreMode rows of `summary.df`, then saves the figure
# to figs/fig.5e.pdf.
#
# weighted.mean(..., na.rm = TRUE) drops the weight together with a missing
# value. The previous sum(x * w, na.rm = TRUE) / sum(w) kept the weights of
# missing values in the denominator and so pulled the average towards 0.
uniq.model.result.plot <- uniq.result.plot[!duplicated(uniq.result.plot[, c('model', "subset")]), ]
for (i in seq_len(nrow(uniq.model.result.plot))) {
  # Per-gene rows of the current (model, subset) cell.
  same.cell <- uniq.result.plot$model == uniq.model.result.plot$model[i] &
    uniq.result.plot$subset == uniq.model.result.plot$subset[i]
  aucs <- uniq.result.plot$auc[same.cell]
  auc.ses <- uniq.result.plot$auc.se[same.cell]

  # Training-set sizes always come from the PreMode rows, whatever the model.
  # Assumes these rows are in the same gene order as `same.cell` -- both
  # tables derive from the same CSV; weighted.mean() at least errors if the
  # lengths differ.
  size.rows <- summary.df$seed == 0 &
    summary.df$model == "PreMode" &
    summary.df$subset == uniq.model.result.plot$subset[i]
  ngof <- summary.df$ngof.train[size.rows]
  nlof <- summary.df$nlof.train[size.rows]
  data.points <- 1 / (1 / ngof + 1 / nlof)

  uniq.model.result.plot$auc[i] <- weighted.mean(aucs, data.points, na.rm = TRUE)
  uniq.model.result.plot$auc.se[i] <- weighted.mean(auc.ses, data.points, na.rm = TRUE)
}

p <- ggplot(uniq.model.result.plot, aes(x = subset, y = auc, col = model)) +
  geom_point() +
  geom_errorbar(aes(ymin = auc - auc.se, ymax = auc + auc.se), width = .2) +
  geom_line() +
  scale_y_continuous(breaks = seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
  scale_x_continuous(breaks = c(1, 2, 4, 6, 8),
                     labels = c(" (10%)", " (20%)", " (40%)", " (60%)", " (80%)")) +
  ylab('AUC') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
        text = element_text(size = 16),
        plot.title = element_text(size = 15),
        legend.text = element_text(size = 10),
        axis.title.x = element_text(size = 12),
        legend.position = "bottom",
        legend.direction = "horizontal") +
  ggtitle("Weighted Average of Model AUC\non subsample of training") +
  ggeasy::easy_center_title() +
  xlab("training data size (% of full G/LoF dataset)")
ggsave('figs/fig.5e.pdf', p, width = 4, height = 5)