# File size: 3,440 Bytes
#######################
# caret path vs neut  #
#######################

# read and install libraries if necessary ####
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("data.table")) install.packages("data.table")
if (!require("caret")) install.packages("caret")
if (!require("doParallel")) install.packages("doParallel")
if (!require("mltools")) install.packages("mltools")
library(ggplot2)
library(data.table)
library(caret)
library(doParallel)
library("mltools")

# read files ####
# feature table (later subset to the training variants)
featuretable <- fread(input = "featuretable4github_revision.txt")
# variant table with the functional labels used for training
varall <- fread(
  input = "SupplementaryTable_S1_pathvariantsusedintraining_revision2.txt"
)
# lookup table: feature_name -> feature_name4plot (used for plot labels)
prettynames <- fread(input = "pretty_featurenames2.txt")
# alignment table; converted to a plain data.frame before it is handed on
famcacscn <- fread(input = "scncacaa_familyalignedCACNA1Acantranscript.txt")
famcacscn <- as.data.frame(famcacscn)

# read functions ####
# NOTE(review): presumably provides predictgof() and modelperformance(),
# which are called further down but not defined in this script - confirm.
source(file = "R_functions4predicting_goflof_CACNA1SCN.R")

# format tables ####

# ... variant table

# keep variants flagged for functional prediction that carry a lof/gof label
varall <- varall[
  used_in_functional_prediction %in% 1 &
    prd_mech_revised %in% c("lof", "gof")
]
# one row per (gene, altAA, pos); the first occurrence is kept
varall <- unique(varall, by = c("gene", "altAA", "pos"))

# ... feature table

# drop (by reference) genomic positions, STRAND/transcript info and the flag
# for membership in the polyphen training set (inpp2)
featuretable[
  ,
  (c("chr", "genomic_pos", "USED_REF", "STRAND", "Feature", "inpp2")) := NULL
]
# drop every variant density feature (column name contains "dens")
featuretable[, (grep("dens", names(featuretable))) := NULL]
# drop the most correlated variables (as previously identified with the caret
# preprocessing functions)
featuretable[, (c("H", "caccon", "SF_DEKA")) := NULL]
# collapse rows that became identical after the column removal
featuretable <- unique(featuretable)

# subset feature table to variants
# one feature row per variant, looked up by protid; variants without a match
# yield an all-NA row that complete.cases() removes below
feat <- featuretable[match(varall$protid, featuretable$protid)]
feat[, Class := varall$prd_mech_revised]
feat <- feat[complete.cases(feat)]
varallmod <- as.data.frame(feat)

# train model ####

# predictgof() is not defined in this script; the code below only relies on
# its result being a list with the predictions first and the model second.
outi <- predictgof(
  varallmod = varallmod,
  modeltype = "gbm",
  featuretable = featuretable,
  alignmentfile = famcacscn
)

out <- outi[[1]]    # predictions
model1 <- outi[[2]] # fitted model, used for the feature importance plot
write.csv(out, file = "fuNCion.predictions.csv")

# results ####

# results in manuscript
modelperformance(out)
# Balanced_Accuracy   0.7990196
# Sens                0.8300000
# Spec                0.7600000
# AUC(=ROC)           0.8464052
# Precision           0.6756757
# Recall              0.8333333
# F1                  0.7462687
# prAUC               0.7770830
# Kappa               0.5706268
# MCC                 0.5797599

# plot Feature importance ####

# Per-feature importance table taken from the fitted model.
# NOTE(review): `plotit` replaces the partially matched `plot=F`; this assumes
# model1 is a gbm fit (modeltype = "gbm") whose summary method has a `plotit`
# argument - confirm.
importance_matrix <- base::summary(model1, plotit = FALSE)
colnames(importance_matrix) <- c("Feature", "Importance")
# strip the backticks that wrap some feature names (weird formatting bug)
importance_matrix$Feature <- gsub("`", "", importance_matrix$Feature)
# only features with an importance above 0.05 are plotted
importance_matrix <- importance_matrix[importance_matrix$Importance > 0.05, ]
# replace raw feature names by their display names; a feature without an entry
# in prettynames keeps its raw name instead of becoming NA
pretty_label <- prettynames$feature_name4plot[
  match(importance_matrix$Feature, prettynames$feature_name)
]
importance_matrix$Feature <- ifelse(
  is.na(pretty_label), importance_matrix$Feature, pretty_label
)
importance_matrix$Feature <- gsub(", DSSP", "", importance_matrix$Feature) # rm DSSP

featimpxgb <- ggplot(
  importance_matrix,
  aes(
    # reversed level order so the first table row ends up at the top once the
    # axes are flipped; unique() guards against duplicated display names
    x = factor(Feature, levels = rev(unique(Feature))),
    y = Importance
  )
) +
  # bar width is a fixed setting, so it is passed to the geom, not to aes()
  geom_bar(
    fill = "#00000088", stat = "identity", position = "identity", width = 0.3
  ) +
  ggplot2::coord_flip() +
  xlab("Features") +
  ylab("Relative Influence") +
  ggtitle("Feature Importance") +
  # The complete theme must come first: adding theme_bw() after theme() would
  # replace the title and grid settings below.
  theme_bw() +
  theme(
    plot.title = element_text(lineheight = 0.9, face = "bold"),
    panel.grid.major.y = element_blank()
  )
featimpxgb