Upload folder using huggingface_hub

7718235 verified 10 months ago

4.83 kB

	library(ggplot2)

	# ---- Configuration ----------------------------------------------------
	# Task table. The names of each inner vector ("score.1", "score.2") are the
	# label columns that get selected from the per-fold test tables below.
	task.dic <- list(
	  Stab = c(score.1 = "stability.1", score.2 = "stability.2")
	)

	# Python interpreter used further down to launch elastic.net.dms.py.
	py.path <- "/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python"

	# Twenty one-letter residue codes. Not referenced again in this section;
	# kept because the code sourced below may read it (TODO confirm).
	alphabet_premode <- c(
	  "L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
	  "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C"
	)

	# Tasks to evaluate; each entry is both a key of task.dic and a sub-directory.
	genes <- "Stab"

	# Precomputed predictor scores, looked up as columns of the annotated tables.
	scores <- c(
	  "AlphaMissense", "gMVP", "PrimateAI", "REVEL", "ESM1b.LLR", "FoldXddG"
	)

	# Model result directories (the trailing slash matters: these strings are
	# pasted directly into file paths) and their display names.
	models <- c("PreMode/", "ESM.SLP/")
	models.dic <- setNames(c("PreMode", "ESM+SLP"), models)

	# ---- Helpers ----------------------------------------------------------
	source("./AUROC.R")

	# Token vocabulary (labelled "esm alphabets" by the original author).
	alphabet <- c(
	  "<cls>", "<pad>", "<eos>", "<unk>",
	  "L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
	  "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C",
	  "X", "B", "U", "Z", "O", ".", "-",
	  "<null_1>", "<mask>"
	)

	# ---- Accumulators -----------------------------------------------------
	# Per-fold results are appended to this table by the evaluation loop.
	result.df <- NULL
	# Evaluate the precomputed scores first, then the trained models.
	scores <- c(scores, models)
	# Benchmark every score/model on each of the five test folds, then fit an
	# elastic-net baseline on biochemical features, appending one row per
	# (model, gene, fold) to result.df.
	#
	# Uses names defined outside this section: genes, scores, models,
	# models.dic, task.dic, py.path, result.df, and the helpers plot.R2,
	# prepare.unique.id and prepare.biochemical (not defined here; presumably
	# provided by the sourced AUROC.R -- TODO confirm).

	# Display names do not depend on gene or fold, so compute them once.
	model.names <- scores
	is.model <- model.names %in% models
	model.names[is.model] <- models.dic[model.names[is.model]]

	# Collect per-fold tables in a list and bind once after the loops.
	fold.results <- list()
	for (i in seq_along(genes)) {
	  gene <- genes[i]
	  label.cols <- names(task.dic[[gene]])
	  n.tasks <- length(task.dic[[gene]])
	  # Model outputs are stored in zero-based columns logits.0, logits.1, ...
	  logit.cols <- paste0('logits.', seq_len(n.tasks) - 1)

	  for (fold in 0:4) {
	    # Annotated test table: holds the labels and the precomputed scores.
	    annot.df <- read.csv(paste0('PreMode/', gene, '/',
	                                '/test.fold.', fold, '.annotated.csv'))

	    # Keep the annotated table separate from model prediction tables so
	    # the evaluation no longer depends on models being last in `scores`.
	    eval.df <- annot.df
	    stab.r <- numeric(length(scores))
	    for (k in seq_along(scores)) {
	      score <- scores[k]
	      if (score %in% models) {
	        eval.df <- read.csv(paste0(score, gene, '/',
	                                   '/testing.fold.', fold, '.csv'))
	        pred.cols <- logit.cols
	      } else {
	        eval.df <- annot.df
	        # The same score column is compared against every task label.
	        pred.cols <- rep(score, n.tasks)
	      }
	      # Absolute value of the $R2 element returned by plot.R2, averaged
	      # over tasks.
	      all.r <- abs(plot.R2(eval.df[, label.cols], eval.df[, pred.cols])$R2)
	      stab.r[k] <- mean(all.r)
	    }
	    # npoints keeps its historical definition: the row count of the table
	    # the last score in `scores` was evaluated on.
	    score.rows <- data.frame(model = model.names,
	                             HGNC = gene,
	                             fold = fold,
	                             npoints = nrow(eval.df),
	                             stab.rho = stab.r)

	    # ---- Elastic-net baseline on biochemical features ----
	    dms.train.df <- read.csv(paste0('PreMode/', gene, '/',
	                                    '/train.fold.', fold, '.annotated.csv'))
	    dms.train.df <- prepare.unique.id(dms.train.df)
	    # Reuse the already loaded test table instead of reading it again.
	    dms.df <- prepare.unique.id(annot.df)
	    gene.train.biochem <- prepare.biochemical(dms.train.df)
	    gene.test.biochem <- prepare.biochemical(dms.df)

	    # Hand the four tables to the Python script through temporary CSVs.
	    tmp.files <- c(train.label = tempfile(),
	                   test.label = tempfile(),
	                   train.biochem = tempfile(),
	                   test.biochem = tempfile())
	    res <- tryCatch(
	      {
	        write.csv(dms.train.df, file = tmp.files[["train.label"]])
	        write.csv(dms.df, file = tmp.files[["test.label"]])
	        write.csv(gene.train.biochem, file = tmp.files[["train.biochem"]])
	        write.csv(gene.test.biochem, file = tmp.files[["test.biochem"]])
	        # Argument order expected by the script: train features, train
	        # labels, test features, test labels. Paths are shell-quoted.
	        system2(py.path,
	                args = c('elastic.net.dms.py',
	                         shQuote(tmp.files[c("train.biochem", "train.label",
	                                             "test.biochem", "test.label")])),
	                stdout = TRUE)
	      },
	      # Remove the temporary files whether or not the call succeeded.
	      finally = unlink(tmp.files)
	    )

	    # Each output line is split on '=' and its second field is read as a
	    # number; lines without a second field yield NA.
	    elastic.r2 <- vapply(strsplit(res, split = '=', fixed = TRUE),
	                         function(kv) as.numeric(kv[2]),
	                         numeric(1))
	    baseline.row <- data.frame(model = 'Elastic Net',
	                               HGNC = gene,
	                               fold = fold,
	                               npoints = nrow(dms.df),
	                               stab.rho = mean(elastic.r2))

	    fold.results[[length(fold.results) + 1]] <- rbind(score.rows, baseline.row)
	  }
	}
	result.df <- rbind(result.df, do.call(rbind, fold.results))
	# Save the per-fold table, then collapse it to one row per (model, HGNC)
	# holding the plain (unweighted) mean and standard deviation of stab.rho
	# across folds. The fold-0 rows serve as the template for the summary.
	dir.create('./figs', showWarnings = FALSE, recursive = TRUE)
	write.csv(result.df, './figs/fig.sup.6.csv')

	uniq.result.plot <- result.df[result.df$fold == 0, ]
	# Create the sd column up front instead of relying on implicit recycling
	# during the first loop iteration.
	uniq.result.plot$stab.rho.sd <- rep(NA_real_, nrow(uniq.result.plot))
	for (i in seq_len(nrow(uniq.result.plot))) {
	  # All folds belonging to the same model and gene as summary row i.
	  same.group <- result.df$model == uniq.result.plot$model[i] &
	    result.df$HGNC == uniq.result.plot$HGNC[i]
	  fold.values <- result.df$stab.rho[same.group]
	  uniq.result.plot$stab.rho[i] <- mean(fold.values, na.rm = TRUE)
	  uniq.result.plot$stab.rho.sd[i] <- sd(fold.values, na.rm = TRUE)
	}
	# Dot plot of the across-fold mean per model (x = stab.rho, y = model) with
	# horizontal bars spanning mean +/- one standard deviation.
	p <- ggplot(data = uniq.result.plot, mapping = aes(x = stab.rho, y = model))
	p <- p + geom_point()
	p <- p + geom_errorbarh(
	  mapping = aes(xmin = stab.rho - stab.rho.sd,
	                xmax = stab.rho + stab.rho.sd),
	  height = 0.2
	)
	# No shape aesthetic is mapped above, so this scale has nothing to act on;
	# it is kept as in the original plot specification.
	p <- p + scale_shape_manual(values = 11:18)
	p <- p + ggtitle("Spearman Correlation (5 Fold testing)")
	p <- p + theme_bw()
	p <- p + ggeasy::easy_center_title()

	# Write the figure as a 5 x 4 (width x height) PDF.
	ggsave(filename = 'figs/fig.sup.6.pdf', plot = p, height = 4, width = 5)