library(ggplot2)

# Five-fold benchmark on the "Stab" task.
# For every fold (0-4) this script:
#   1. scores each external predictor and each trained model against the
#      label columns using plot.R2(),
#   2. fits an elastic-net baseline on biochemical features by calling the
#      external script elastic.net.dms.py,
# then writes the per-fold table to figs/fig.sup.6.csv and a mean +/- sd
# summary plot to figs/fig.sup.6.pdf.
# plot.R2(), prepare.unique.id() and prepare.biochemical() are not defined in
# this file; presumably they come from ./AUROC.R (sourced below) -- confirm.
# NOTE(review): the metric is read from the `$R2` field of plot.R2() but is
# stored as `stab.rho` and titled "Spearman Correlation" -- confirm which
# statistic plot.R2() actually returns.

# Per gene: the names are the label columns read from the test tables.
task.dic <- list("Stab"=c("score.1"="stability.1", "score.2"="stability.2"))
py.path <- '/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python'
alphabet_premode <- c('L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C')
genes <- c("Stab")
# Columns of the annotated test table that hold third-party predictions.
scores <- c('AlphaMissense', 'gMVP', 'PrimateAI', 'REVEL', 'ESM1b.LLR', 'FoldXddG')
# Directories holding per-fold model predictions, and their display names.
models <- c('PreMode/', 'ESM.SLP/')
models.dic <- c('PreMode/'='PreMode', "ESM.SLP/"='ESM+SLP')
# add baseline AUC
# esm alphabets
source('./AUROC.R')
# Not referenced below; kept because sourced helpers may read these globals.
alphabet <- c('', '', '', '', 'L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-', '', '')

# first plot PreMode pretrained auc vs other scores
# Models are appended AFTER the external scores. The scoring loop relies on
# this order: once a model is reached, the scoring table is replaced by that
# model's prediction file for the rest of the loop.
scores <- c(scores, models)

# Per-fold result rows are collected here and bound once after the loops.
fold.results <- list()

for (i in seq_along(genes)) {
  gene <- genes[i]
  label.cols <- names(task.dic[[gene]])
  n.tasks <- length(label.cols)

  for (fold in 0:4) {
    test.df <- read.csv(paste0('PreMode/', gene, '/', '/test.fold.', fold, '.annotated.csv'))

    # calculate R2
    score.df <- test.df
    stab.r <- numeric(length(scores))
    for (s in seq_along(scores)) {
      score <- scores[s]
      if (score %in% models) {
        # Model predictions live in their own file, in columns logits.0,
        # logits.1, ... (one per task).
        score.df <- read.csv(paste0(score, gene, '/', '/testing.fold.', fold, '.csv'))
        pred.cols <- paste0('logits.', seq_len(n.tasks) - 1)
      } else {
        # A single external score column is compared against every task.
        pred.cols <- rep(score, n.tasks)
      }
      all.r <- abs(plot.R2(score.df[, label.cols], score.df[, pred.cols])$R2)
      stab.r[s] <- mean(all.r)
    }

    # Replace model directory names with their display names.
    model.names <- scores
    is.model <- model.names %in% models
    model.names[is.model] <- models.dic[model.names[is.model]]

    # npoints is the row count of the table scored last (the last model's
    # prediction file when `models` is non-empty).
    fold.results[[length(fold.results) + 1]] <- data.frame(
      model = model.names,
      HGNC = gene,
      fold = fold,
      npoints = nrow(score.df),
      stab.rho = stab.r
    )

    # add biochem properties
    train.df <- read.csv(paste0('PreMode/', gene, '/', '/train.fold.', fold, '.annotated.csv'))
    train.df <- prepare.unique.id(train.df)
    test.df <- prepare.unique.id(test.df)
    # get train and test biochemical
    gene.train.biochem <- prepare.biochemical(train.df)
    gene.test.biochem <- prepare.biochemical(test.df)

    # Hand labels and features to the Python script through temporary CSV
    # files; they are deleted again even if writing or the call fails.
    tmp.files <- c(
      train.biochem = tempfile(),
      train.label = tempfile(),
      test.biochem = tempfile(),
      test.label = tempfile()
    )
    res <- tryCatch(
      {
        write.csv(train.df, file = tmp.files[["train.label"]])
        write.csv(test.df, file = tmp.files[["test.label"]])
        write.csv(gene.train.biochem, file = tmp.files[["train.biochem"]])
        write.csv(gene.test.biochem, file = tmp.files[["test.biochem"]])
        # Argument order: train features, train labels, test features,
        # test labels. Paths are shell-quoted so spaces cannot split them.
        system(
          paste(
            shQuote(py.path),
            'elastic.net.dms.py',
            shQuote(tmp.files[["train.biochem"]]),
            shQuote(tmp.files[["train.label"]]),
            shQuote(tmp.files[["test.biochem"]]),
            shQuote(tmp.files[["test.label"]])
          ),
          intern = TRUE
        )
      },
      finally = unlink(tmp.files)
    )

    # Each captured output line is split on '=' and the second field is taken
    # as a number; presumably the script prints one `<name>=<value>` line per
    # task -- confirm against elastic.net.dms.py.
    baseline.auc.3 <- list(R2=as.numeric(as.data.frame(strsplit(res, split = '='))[2,]))
    fold.results[[length(fold.results) + 1]] <- data.frame(
      model = c('Elastic Net'),
      HGNC = gene,
      fold = fold,
      npoints = nrow(test.df),
      stab.rho = c(mean(baseline.auc.3$R2))
    )
  }
}

result.df <- do.call(rbind, fold.results)

# Both outputs go to ./figs; create it if it is missing.
dir.create('./figs', showWarnings = FALSE)
write.csv(result.df, './figs/fig.sup.6.csv')

# Collapse the folds: one row per (model, HGNC) holding the plain (unweighted)
# mean and standard deviation of stab.rho across folds. The fold-0 rows serve
# as the template for the summary table.
uniq.result.plot <- result.df[result.df$fold==0,]
uniq.result.plot$stab.rho.sd <- rep(NA_real_, nrow(uniq.result.plot))
for (i in seq_len(nrow(uniq.result.plot))) {
  same.group <- result.df$model==uniq.result.plot$model[i] &
    result.df$HGNC==uniq.result.plot$HGNC[i]
  fold.values <- result.df$stab.rho[same.group]
  uniq.result.plot$stab.rho[i] <- mean(fold.values, na.rm=TRUE)
  uniq.result.plot$stab.rho.sd[i] <- sd(fold.values, na.rm=TRUE)
}

# One point per model with a horizontal mean +/- sd error bar.
p <- ggplot(uniq.result.plot, aes(x=stab.rho, y=model)) +
  geom_point() +
  # geom_errorbar(aes(ymin=other.rho-other.rho.sd, ymax=other.rho+other.rho.sd)) +
  geom_errorbarh(aes(xmin=stab.rho-stab.rho.sd, xmax=stab.rho+stab.rho.sd), height=.2) +
  # geom_abline(slope = 1, intercept = 0, linetype = "dashed", alpha=0.2) +
  scale_shape_manual(values = 11:18) +
  ggtitle("Spearman Correlation (5 Fold testing)") +
  theme_bw() +
  ggeasy::easy_center_title()
ggsave('figs/fig.sup.6.pdf', p, height = 4, width = 5)