# File size: 3,440 Bytes
# 7718235
#######################
# caret path vs neut #
#######################
# read and install libraries if necessary ####
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("data.table")) install.packages("data.table")
if (!require("caret")) install.packages("caret")
if (!require("doParallel")) install.packages("doParallel")
if (!require("mltools")) install.packages("mltools")
library(ggplot2)
library(data.table)
library(caret)
library(doParallel)
library("mltools")
# read files ####
# All paths are relative to the working directory. The first three tables stay
# data.tables (fread's default); the alignment is converted to a plain
# data.frame, which is the form later passed as `alignmentfile`.
featuretable <- data.table::fread("featuretable4github_revision.txt")
varall <- data.table::fread(
  "SupplementaryTable_S1_pathvariantsusedintraining_revision2.txt"
)
prettynames <- data.table::fread("pretty_featurenames2.txt")
famcacscn <- as.data.frame(
  data.table::fread("scncacaa_familyalignedCACNA1Acantranscript.txt")
)
# read functions ####
# predictgof() and modelperformance() are called below but not defined in this
# script; presumably this helper file provides them.
source("R_functions4predicting_goflof_CACNA1SCN.R")
# format tables ####
# ... variant table
# keep rows flagged for functional prediction whose revised mechanism label is
# either "lof" or "gof"
varall <- varall[
  used_in_functional_prediction %in% 1 &
    prd_mech_revised %in% c("lof", "gof")
]
# remove duplicate sites (the first row per gene / altAA / pos is kept):
varall <- unique(varall, by = c("gene", "altAA", "pos"))
# ...feature table
# remove genomic positions, STRAND/transcript info, whether variant part of
# polyphen training set (inpp2); columns are dropped by reference:
featuretable[
  ,
  c("chr", "genomic_pos", "USED_REF", "STRAND", "Feature", "inpp2") := NULL
]
# remove all variant density features (every column whose name contains "dens")
featuretable[, (grep("dens", names(featuretable))) := NULL]
# rmv most correlated variables (as previously identified with caret
# preprocessing fcts)
featuretable[, c("H", "caccon", "SF_DEKA") := NULL]
# collapse rows that became identical after the column removal
featuretable <- unique(featuretable)
# subset feature table to variants
# one feature row per variant, in varall order; a variant whose protid is not
# in the feature table yields an all-NA row here
feat <- featuretable[match(varall[["protid"]], featuretable[["protid"]])]
# attach the label the model is trained on
feat[, Class := varall[["prd_mech_revised"]]]
# drop rows with any missing value (this also removes the unmatched variants)
feat <- feat[complete.cases(feat)]
varallmod <- as.data.frame(feat)
# train model ####
# predictgof() comes from the sourced helper file. Its result is used as a
# two-element list: element 1 is written out as the predictions table,
# element 2 is the fitted model summarised in the importance plot below.
outi <- predictgof(
  varallmod = varallmod,
  modeltype = "gbm",
  featuretable = featuretable,
  alignmentfile = famcacscn
)
out <- outi[[1]]
model1 <- outi[[2]]
write.csv(out, file = "fuNCion.predictions.csv")
# results ####
# results in manuscript
modelperformance(out)
# Recorded output of the call above:
#   Balanced_Accuracy   Sens        Spec        AUC(=ROC)   Precision
#   0.7990196           0.8300000   0.7600000   0.8464052   0.6756757
#   Recall              F1          prAUC       Kappa       MCC
#   0.8333333           0.7462687   0.7770830   0.5706268   0.5797599
# plot Feature importance ####
# Builds and prints a horizontal bar chart of the per-feature importance that
# summary() reports for the fitted model.
# NOTE(review): `plotit` is the full argument name of gbm's summary method (the
# previous `plot = F` only worked through partial matching); model1 is
# presumably a gbm fit, given modeltype = "gbm" above - confirm.
importance_matrix <- base::summary(model1, plotit = FALSE)
colnames(importance_matrix) <- c("Feature", "Importance")
importance_matrix$Feature <- gsub("`", "", importance_matrix$Feature) # weird formatting bug
# keep only features whose importance exceeds 0.05
importance_matrix <- importance_matrix[importance_matrix$Importance > 0.05, ]
# Swap internal feature names for display names. A feature without an entry in
# prettynames keeps its internal name instead of becoming an NA axis label.
pretty_labels <- prettynames[match(importance_matrix$Feature, feature_name)]$feature_name4plot
missing_label <- is.na(pretty_labels)
pretty_labels[missing_label] <- importance_matrix$Feature[missing_label]
importance_matrix$Feature <- pretty_labels
importance_matrix$Feature <- gsub(", DSSP", "", importance_matrix$Feature) # rm DSSP
featimpxgb <- ggplot(
  importance_matrix,
  aes(
    # reversed levels so that, after coord_flip(), the first row is on top
    x = factor(Feature, levels = rev(Feature)),
    y = Importance
  )
) +
  # bar width is a fixed setting, so it is a geom parameter rather than an aes
  geom_bar(
    fill = "#00000088", stat = "identity", position = "identity", width = 0.3
  ) +
  ggplot2::coord_flip() +
  xlab("Features") +
  ylab("Relative Influence") +
  ggtitle("Feature Importance") +
  # theme_bw() is a complete theme: it must come BEFORE the theme() tweaks,
  # otherwise it resets the bold title and the blanked grid lines
  theme_bw() +
  theme(
    plot.title = element_text(lineheight = 0.9, face = "bold"),
    panel.grid.major.y = element_blank()
  )
featimpxgb
|