# PreMode / analysis /funNCion /modelling_path_CACNA1SCN_4github.R
# gzhong's picture
# Upload folder using huggingface_hub
# 7718235 verified
#######################
# caret path vs neut #
#######################
# read and install libraries if necessary ####
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("data.table")) install.packages("data.table")
if (!require("caret")) install.packages("caret")
if (!require("doParallel")) install.packages("doParallel")
if (!require("mltools")) install.packages("mltools")
library(ggplot2)
library(data.table)
library(caret)
library(doParallel)
library("mltools")
# read files ####
# All paths are relative to the current working directory.
# feature table (later subset by `protid`)
featuretable <- fread(file = "featuretable4github_revision.txt")
# pathogenic variants (Supplementary Table S1)
varall <- fread(
  file = "SupplementaryTable_S1_pathvariantsusedintraining_revision.txt"
)
# lookup table used to relabel features for plotting
prettynames <- fread(file = "pretty_featurenames2.txt")
# neutral variants (Supplementary Table S2)
gnomadnoneur2 <- fread(
  file = "SupplTables/S2_neutralvariantsusedintraining3.txt"
)
# read functions ####
# predictpath() and modelperformance() are not defined in this script;
# presumably they come from this helper file -- verify.
source(file = "R_functions4predicting_path_GOFLOF_CACNA1SCN.r")
# format tables ####
# ...pathogenic variant table
# keep only the variants flagged for use in pathogenicity prediction
varall <- varall[used_in_pathogenicity_prediction %in% 1]
# remove duplicate sites: one row per (gene, altAA, pos), first occurrence kept
varall <- unique(varall, by = c("gene", "altAA", "pos"))
# ...neutral variant table
gnomadnoneur <- gnomadnoneur2[used_in_pathogenicity_prediction == 1]
# ...feature table
# remove features (columns are dropped by reference)
# drop the most correlated variables (as previously identified with the caret
# preprocessing functions)
featuretable[, c("H", "caccon", "SF_DEKA") := NULL]
# drop protein/genomic positions, STRAND/transcript info and the flag marking
# variants that are part of the polyphen training set (inpp2)
featuretable[, c(
  "chr", "genomic_pos", "USED_REF", "alt", "pos", "refAA", "altAA",
  "STRAND", "Feature", "inpp2"
) := NULL]
# collapse rows that are identical in the remaining columns
featuretable <- unique(featuretable)
# subset feature table to variants
# A protid that is absent from the feature table yields an all-NA row here;
# such rows are removed by complete.cases() below.
# gnomad variants
featgn <- featuretable[match(gnomadnoneur$protid, protid)]
featgn[, Class := "neutral"]
# pathogenic variants
feat <- featuretable[match(varall$protid, protid)]
feat[, Class := "pathogenic"]
# remove genes with only neutral (no pathogenic) variants
featgn <- featgn[gene %in% feat$gene]
# combine neutral + path variants and keep only rows without missing values
feat <- rbind(feat, featgn)
feat <- feat[complete.cases(feat)]
# independent copy handed to the model-training step
varallmod <- copy(feat)
# train model ####
# predictpath() is not defined in this script (presumably it comes from the
# sourced helper file); elements 1 and 2 of its result are used below.
# NOTE(review): no set.seed() is visible here -- confirm that predictpath()
# fixes the seed, otherwise the numbers recorded below may not reproduce.
outi <- predictpath(modeltype = "gbm", varallmod = varallmod)
out <- outi[[1]]     # prediction table, evaluated with modelperformance()
model1 <- outi[[2]]  # fitted model, reused with predict() and summary()
# results ####
# "original"
modelperformance(out)
# Balanced_Accuracy Sens Spec AUC Precision Recall
# 0.8963905 0.9000000 0.8900000 0.9530308 0.9437229 0.8897959
# prAUC Kappa MCC
# 0.9385336 0.7744518 0.7768762
# predicting also variants in genes with only neutral variants ####
# keep the predictions obtained so far before `out` is overwritten
outbalancgenes <- out
# remake featgn obj with genes with no pathogenic variants
featgn <- featuretable[match(gnomadnoneur$protid, protid)]
# neutral variants whose gene is not part of the modelling table
# NOTE(review): unlike the modelling table, `testing` is not filtered with
# complete.cases(); confirm that it contains no missing values.
testing <- featgn[!gene %in% varallmod$gene]
out <- local({
  # class probabilities are computed once and reused for both columns
  class_prob <- predict(model1, newdata = testing, type = "prob")
  class_pred <- predict(model1, newdata = testing)
  # fail with an explicit message if predict() returned a different number
  # of rows than it was given (e.g. rows dropped because of missing values)
  if (NROW(class_prob) != nrow(testing) ||
      length(class_pred) != nrow(testing)) {
    stop(
      "predict() returned ", length(class_pred), " predictions for ",
      nrow(testing), " variants; check `testing` for missing values",
      call. = FALSE
    )
  }
  res <- data.frame(
    # all of these variants are observed as neutral; rep() keeps the column
    # the same length as the others even when `testing` is empty
    obs = rep("neutral", nrow(testing)),
    neutral = class_prob[, "neutral"],
    pathogenic = class_prob[, "pathogenic"],
    pred = class_pred,
    gene = testing$gene,
    protid = testing$protid
  )
  # give `obs` the same level set and order as `pred`, independent of which
  # classes happen to be predicted; fall back to the observed values when
  # the predictions are not a factor
  obs_levels <- if (is.factor(class_pred)) {
    levels(class_pred)
  } else {
    unique(class_pred)
  }
  res$obs <- factor(res$obs, levels = obs_levels)
  res
})
outneutgeens <- out
# pooled evaluation: earlier predictions + neutral-only genes
vartable <- rbind(outbalancgenes, outneutgeens)
# confusion table, observed class (rows) vs predicted class (columns)
table(obs = vartable$obs, pred = vartable$pred)
# pred
# obs neutral pathogenic
# neutral 1518 193
# pathogenic 13 121
# all genes
modelperformance(vartable)
# Balanced_Accuracy Sens Spec AUC Precision Recall
# 0.8950928 0.9000000 0.8900000 0.9506268 0.9915088 0.8872005
# prAUC Kappa MCC
# 0.8241656 0.4880578 0.5457001
# genes whose name contains "SCN"
modelperformance(
  vartable[grep(pattern = "SCN", x = vartable$gene, fixed = TRUE), ]
)
# Balanced_Accuracy Sens Spec AUC Precision Recall
# 0.8580825 0.9000000 0.8200000 0.9228044 0.9600000 0.8205128
# prAUC Kappa MCC
# 0.8736877 0.6232456 0.6447642
# genes whose name contains "CAC"
modelperformance(
  vartable[grep(pattern = "CAC", x = vartable$gene, fixed = TRUE), ]
)
# Balanced_Accuracy Sens Spec AUC Precision Recall
# 0.9258901 0.9500000 0.9000000 0.9691563 0.9991877 0.9044118
# prAUC Kappa MCC
# 0.6417708 0.1959327 0.3207849
# plot Feature importance ####
# Importance table from summary() on the fitted model; the argument is spelled
# out as `plotit = FALSE` instead of relying on partial matching of `plot` and
# on the reassignable shorthand `F`.
# NOTE(review): assumes summary() dispatches to gbm's summary method (the model
# was requested with modeltype = "gbm") -- confirm.
importance_matrix <- base::summary(model1, plotit = FALSE)
colnames(importance_matrix) <- c("Feature", "Importance")
# strip backticks from the feature names (weird formatting bug)
importance_matrix$Feature <- gsub(
  "`", "", importance_matrix$Feature,
  fixed = TRUE
)
# keep only features above the importance cut-off
importance_matrix <-
  importance_matrix[importance_matrix$Importance > 0.05, ]
# replace raw feature names by their plot labels; a feature without an entry
# in `prettynames` keeps its raw name instead of turning into NA
importance_matrix$Feature <- local({
  raw_name <- importance_matrix$Feature
  hit <- match(raw_name, prettynames$feature_name)
  pretty <- as.character(prettynames$feature_name4plot[hit])
  ifelse(is.na(pretty), raw_name, pretty)
})
importance_matrix$Feature <- gsub(", DSSP", "", importance_matrix$Feature) # rm DSSP
featimpxgb <- ggplot(
  importance_matrix,
  aes(
    # reversed row order, so the first row ends up on top after coord_flip();
    # unique() avoids a duplicated-level error if two features share a label
    x = factor(Feature, levels = rev(unique(Feature))),
    y = Importance
  )
) +
  # the bar width is a constant, so it is set as a parameter, not inside aes()
  geom_bar(
    fill = "#00000088", stat = "identity", position = "identity",
    width = 0.3
  ) +
  ggplot2::coord_flip() +
  xlab("Features") +
  ylab("Relative Influence") +
  ggtitle("Feature Importance") +
  # the complete theme must come first: adding theme_bw() after theme() would
  # replace the customisations below
  theme_bw() +
  theme(
    plot.title = element_text(lineheight = 0.9, face = "bold"),
    panel.grid.major.y = element_blank()
  )
featimpxgb