Note: helper_functions codes were adopted from “https://github.com/favilaco/deconv_benchmark/Master_deconvolution.R”. Nature Communications; https://doi.org/10.1038/s41467-020-19015-1.
source('CIBERSORT.R')
source('helper_functions.R')
library(Matrix)
library(Biobase)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:Matrix':
##
## which
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library("EnsDb.Hsapiens.v86")
## Loading required package: ensembldb
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:Matrix':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
##
## filter
## The following object is masked from 'package:stats':
##
## filter
bulk: bulk RNA-seq count matrix; filtered.counts: snRNA-seq count matrix; filtered.colMetadata: meta file for snRNA-seq; markers
bulk = readRDS("counts.geneID.sinai_pipeline.RDS")
filtered.counts <- readMM("filtered_count_matrix.mtx")
rownames(filtered.counts) <- readLines("filtered_gene_row_names.txt")
filtered.colMetadata <- read.delim("filtered_column_metadata.txt")
markers_symbol = read.delim("markers_symbol.tsv")
##Preprocessing data matrix
bulk_exp = exprs(bulk)
meta = phenoData(bulk)@data
colnames(bulk_exp) = meta[colnames(bulk_exp),"projid"]
row.names(filtered.colMetadata) = filtered.colMetadata[,1]
colnames(filtered.counts) = row.names(filtered.colMetadata)
Convert Ensemble names to gene symbols
G_list <- ensembldb::select(EnsDb.Hsapiens.v86, keys= row.names(bulk_exp), keytype = "GENEID", columns = c("GENEID","SYMBOL"))
colnames(G_list) = c("gene","Symbol")
row.names(G_list) = G_list$gene
common_gene = as.character(G_list[G_list$Symbol %in% rownames(filtered.counts),"Symbol"])
common_gene_uniq = names(table(common_gene))[table(common_gene)==1]
common_gene_uniq_id = as.character(G_list[G_list$Symbol %in% common_gene_uniq,"gene"])
Select bulk RNA-seq and snRNA-seq from the same individuals
T_orig = bulk_exp[common_gene_uniq_id,]
row.names(T_orig) = G_list[common_gene_uniq_id,"Symbol"]
common_projid = colnames(T_orig)[colnames(T_orig) %in% unique(filtered.colMetadata$projid)]
T_orig = T_orig[,common_projid]
set.seed(1)
sample_subset = sample(common_projid,20)
train_cellID = filtered.counts[,filtered.colMetadata$projid %in% sample_subset]
train = train_cellID
Construct phenotype dataset pDataC
pDataC = filtered.colMetadata[filtered.colMetadata$projid %in% common_projid,]
pDataC = data.frame(cellID = pDataC$TAG, cellType = pDataC$broad.cell.type, sampleID = pDataC$projid)
pDataC[pDataC$cellType=="Ex" | pDataC$cellType=="In", ]$cellType = "Neu"
row.names(pDataC) = pDataC$cellID
colnames(train) = pDataC[colnames(train),]$cellType
individual = unique(pDataC$sampleID)
individual_fraction = list()
for(i in 1:length(individual)){
individual_fraction[[i]] = data.frame(individual[i],table(pDataC[pDataC$sampleID == individual[i],"cellType"])/nrow(pDataC[pDataC$sampleID == individual[i],]))
}
P = do.call(rbind.data.frame, individual_fraction)
colnames(P) = c("tissue", "CT", "expected_values")
normalization = "TMM"
train_TMM = Scaling(train, normalization)
## Repeated column names found in count matrix
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
cellType <- colnames(train_TMM)
group = list()
for(i in unique(cellType)){
group[[i]] <- which(cellType %in% i)
}
C_orig = lapply(group,function(x) Matrix::rowMeans(train_TMM[,x])) #C should be made with the mean (not sum) to agree with the way markers were selected
C = do.call(cbind.data.frame, C_orig)
refProfiles.var = lapply(group,function(x) train_TMM[,x])
refProfiles.var = lapply(refProfiles.var, function(x) matrixStats::rowSds(Matrix::as.matrix(x)))
refProfiles.var = do.call(cbind.data.frame, refProfiles.var)
rownames(refProfiles.var) <- rownames(C_orig)
T = Scaling(T_orig, normalization)
train_cellID_TMM = Scaling(train_cellID, normalization)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
marker_strategy = "all"
marker_distrib = marker_strategies(markers_symbol, marker_strategy, C)
fraction_out = data.frame()
evaluation_out = data.frame()
bulk_methods = c("CIBERSORT","nnls","FARDEEP")
for(i in bulk_methods){
RESULTS = Deconvolution(T = T, C = C, phenoDataC = pDataC, method = i, P = P, elem = to_remove, marker_distrib = marker_distrib, refProfiles.var = refProfiles.var)
RESULTS$method = i
fraction_out = rbind(fraction_out,RESULTS)
evaluation = RESULTS %>% dplyr::summarise(RMSE = sqrt(mean((observed_values-expected_values)^2)) %>% round(.,4),
Pearson=cor(observed_values,expected_values) %>% round(.,4))
evaluation$method = i
evaluation_out = rbind(evaluation_out, evaluation)
}
## Warning in data.table::melt(RESULTS): The melt generic in data.table has
## been passed a matrix and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(RESULTS). In the next version, this warning will become an error.
## Loading required package: nnls
## Warning in data.table::melt(RESULTS): The melt generic in data.table has
## been passed a matrix and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(RESULTS). In the next version, this warning will become an error.
## Loading required package: FARDEEP
sc_methods = c("MuSiC","SCDC","BisqueRNA")
for(i in sc_methods){
if(i == "BisqueRNA"){
RESULTS = Deconvolution(T = T_orig, C = train_cellID, method = i, phenoDataC = pDataC, P = P, elem = to_remove, refProfiles.var = refProfiles.var)
}else{
RESULTS = Deconvolution(T = T, C = train_cellID_TMM, method = i, phenoDataC = pDataC, P = P, elem = to_remove, refProfiles.var = refProfiles.var)
}
RESULTS$method = i
fraction_out = rbind(fraction_out,RESULTS)
evaluation = RESULTS %>% dplyr::summarise(RMSE = sqrt(mean((observed_values-expected_values)^2)) %>% round(.,4),
Pearson=cor(observed_values,expected_values) %>% round(.,4))
evaluation$method = i
evaluation_out = rbind(evaluation_out, evaluation)
}
## Loading required package: xbioc
## Loading required package: MuSiC
## Loading required package: ggplot2
## Loading required package: SCDC
## Creating Basis Matrix adjusted for maximal variance weight
## Used 17036 common genes...
## Used 7 cell types in deconvolution...
## 20149910 has common genes 15977 ...
## WNNLS Converged at iteration 8
## 20249897 has common genes 15515 ...
## WNNLS Converged at iteration 19
## 10536568 has common genes 15538 ...
## WNNLS Converged at iteration 14
## 11399321 has common genes 15644 ...
## WNNLS Converged at iteration 12
## 20179164 has common genes 15902 ...
## WNNLS Converged at iteration 2
## 20275399 has common genes 15974 ...
## WNNLS Converged at iteration 8
## 21189544 has common genes 15548 ...
## WNNLS Converged at iteration 12
## 20977678 has common genes 15801 ...
## WNNLS Converged at iteration 12
## 10222853 has common genes 15995 ...
## WNNLS Converged at iteration 8
## 11302830 has common genes 15897 ...
## WNNLS Converged at iteration 4
## 11409232 has common genes 15932 ...
## WNNLS Converged at iteration 4
## 10101327 has common genes 15944 ...
## WNNLS Converged at iteration 12
## 20956867 has common genes 15919 ...
## WNNLS Converged at iteration 6
## 11336574 has common genes 15848 ...
## WNNLS Converged at iteration 4
## 10260309 has common genes 15968 ...
## WNNLS Converged at iteration 6
## 21126823 has common genes 16067 ...
## WNNLS Converged at iteration 10
## 11345331 has common genes 15967 ...
## WNNLS Converged at iteration 4
## 11342432 has common genes 16111 ...
## WNNLS Converged at iteration 4
## 11200645 has common genes 15846 ...
## WNNLS Converged at iteration 8
## 10288185 has common genes 15918 ...
## WNNLS Converged at iteration 4
## 20104101 has common genes 15995 ...
## WNNLS Converged at iteration 6
## 20261901 has common genes 15559 ...
## WNNLS Converged at iteration 7
## 10248033 has common genes 15769 ...
## WNNLS Converged at iteration 12
## 20207013 has common genes 15943 ...
## WNNLS Converged at iteration 4
## 20173942 has common genes 15932 ...
## WNNLS Converged at iteration 8
## 20254740 has common genes 15836 ...
## WNNLS Converged at iteration 4
## 11609672 has common genes 16023 ...
## WNNLS Converged at iteration 4
## 10261026 has common genes 15954 ...
## WNNLS Converged at iteration 4
## 20112377 has common genes 15967 ...
## WNNLS Converged at iteration 6
## 21142003 has common genes 15884 ...
## WNNLS Converged at iteration 3
## 11072071 has common genes 15910 ...
## WNNLS Converged at iteration 6
## 10514454 has common genes 16062 ...
## WNNLS Converged at iteration 5
## 11399871 has common genes 15882 ...
## WNNLS Converged at iteration 1
## 20963866 has common genes 15938 ...
## WNNLS Converged at iteration 4
## 21412626 has common genes 15898 ...
## WNNLS Converged at iteration 6
## Loading required package: BisqueRNA
## Decomposing into 7 cell types.
## Using 17383 genes in both bulk and single-cell expression.
## Converting single-cell counts to CPM and filtering zero variance genes.
## Filtered 87 zero variance genes.
## Converting bulk counts to CPM and filtering unexpressed genes.
## Filtered 269 unexpressed genes.
## Generating single-cell based reference from 27904 cells.
## Inferring bulk transformation from single-cell alone.
## Applying transformation to bulk samples and decomposing.
write.table(fraction_out,file=paste0("Deconvolution_fraction.tsv"),sep="\t",quote = FALSE,row.names = FALSE,col.names = TRUE)
write.table(evaluation_out,file=paste0("Deconvolution_evaluation.tsv"),sep="\t",quote = FALSE,row.names = FALSE,col.names = TRUE)
write.table(sample_subset,file=paste0("Deconvolution_trainSamples.tsv"),sep="\t",quote = FALSE,row.names = FALSE,col.names = TRUE)
library(ggplot2)
library(tidyr)
library(dplyr)
training = read.delim("Deconvolution_trainSamples.tsv")
fraction_raw = read.delim("Deconvolution_fraction.tsv")
fraction_raw = fraction_raw[fraction_raw$method != "FARDEEP" & fraction_raw$method != "nnls",]
fraction = fraction_raw %>% pivot_longer(c(observed_values,expected_values),names_to = "type",values_to = "fraction")
fraction_observed = fraction[fraction$type=="observed_values",]
fraction_expected = fraction[fraction$type=="expected_values",]
fraction_expected$method = "snRNA-seq"
fraction_all = rbind(fraction_observed,unique(fraction_expected))
fraction_all$method = gsub("BisqueRNA","Bisque", fraction_all$method)
fraction_test = fraction_all[! fraction_all$tissue %in% training$x,]
p1 = ggplot(fraction_test,aes(x=CT,y=fraction,color=method))+geom_boxplot()+
ylab("Fraction") +xlab("Cell Type")+
theme_bw()+theme(panel.border = element_rect(colour = "black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text=element_text(color="black",size=12),axis.title=element_text(color="black",size=13),legend.text=element_text(color="black",size=13),legend.title=element_text(color="black",size=13))
p1
ggsave(p1, file=paste0("Deconvolution_test.pdf"), width=7, height=4)
fraction_test_eval = fraction_raw %>% dplyr::filter(! tissue %in% training$x) %>% group_by(tissue,method) %>%
summarise(RMSE = sqrt(mean((observed_values-expected_values)^2)) %>% round(.,4),Pearson=cor(observed_values,expected_values) %>% round(.,4))
## `summarise()` regrouping output by 'tissue' (override with `.groups` argument)
fraction_test_eval_sum = fraction_test_eval %>% group_by(method) %>% summarise(RMSE_mean = mean(RMSE),Pearson_mean = mean(Pearson))
## `summarise()` ungrouping output (override with `.groups` argument)
p2 = ggplot(fraction_test_eval,aes(x=method,y=Pearson,color=method))+geom_boxplot()+
ylab("Correlation") +xlab("Method")+
theme_bw()+theme(panel.border = element_rect(colour = "black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),axis.text=element_text(color="black",size=12),axis.title=element_text(color="black",size=13),legend.text=element_text(color="black",size=13),legend.title=element_text(color="black",size=13))
p2
ggsave(p2, file=paste0("Deconvolution_test.cor.pdf"), width=4, height=3)
p3 = ggplot(fraction_test_eval,aes(x=method,y=RMSE,color=method))+geom_boxplot()+
ylab("RMSE") +xlab("Method")+
theme_bw()+theme(panel.border = element_rect(colour = "black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),axis.text=element_text(color="black",size=12),axis.title=element_text(color="black",size=13),legend.text=element_text(color="black",size=13),legend.title=element_text(color="black",size=13))
p3
ggsave(p3, file=paste0("Deconvolution_test.RMSE.pdf"), width=4, height=3)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /sc/arion/work/xup04/miniconda2_new/lib/libopenblasp-r0.3.12.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] BisqueRNA_1.0.4 SCDC_0.0.0.9000
## [3] MuSiC_0.1.1 ggplot2_3.3.2
## [5] xbioc_0.1.19 FARDEEP_1.0.1
## [7] nnls_1.4 EnsDb.Hsapiens.v86_2.99.0
## [9] ensembldb_2.10.2 AnnotationFilter_1.10.0
## [11] GenomicFeatures_1.38.2 AnnotationDbi_1.48.0
## [13] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [15] IRanges_2.20.2 S4Vectors_0.24.4
## [17] tidyr_1.1.2 dplyr_1.0.2
## [19] Biobase_2.46.0 BiocGenerics_0.32.0
## [21] Matrix_1.3-2 preprocessCore_1.48.0
## [23] e1071_1.7-4
##
## loaded via a namespace (and not attached):
## [1] backports_1.2.1 BiocFileCache_1.10.2
## [3] plyr_1.8.6 lazyeval_0.2.2
## [5] splines_3.6.3 BiocParallel_1.20.1
## [7] digest_0.6.27 foreach_1.5.1
## [9] htmltools_0.5.1.1 fansi_0.4.2
## [11] magrittr_2.0.1 checkmate_2.0.0
## [13] memoise_2.0.0 limma_3.42.2
## [15] recipes_0.1.16 Biostrings_2.54.0
## [17] gower_0.2.2 matrixStats_0.58.0
## [19] limSolve_1.5.6 MCMCpack_1.4-9
## [21] askpass_1.1 lpSolve_5.6.15
## [23] prettyunits_1.1.1 colorspace_2.0-0
## [25] blob_1.2.1 rappdirs_0.3.3
## [27] xfun_0.19 crayon_1.4.1
## [29] RCurl_1.98-1.2 survival_3.2-7
## [31] iterators_1.0.12 glue_1.4.2
## [33] registry_0.5-1 gtable_0.3.0
## [35] ipred_0.9-11 zlibbioc_1.32.0
## [37] XVector_0.26.0 MatrixModels_0.4-1
## [39] DelayedArray_0.12.3 SparseM_1.78
## [41] scales_1.1.1 pheatmap_1.0.12
## [43] DBI_1.1.0 edgeR_3.28.1
## [45] Rcpp_1.0.6 xtable_1.8-4
## [47] progress_1.2.2 bit_4.0.4
## [49] lava_1.6.9 prodlim_2019.11.13
## [51] httr_1.4.2 RColorBrewer_1.1-2
## [53] ellipsis_0.3.1 farver_2.1.0
## [55] pkgconfig_2.0.3 XML_3.99-0.3
## [57] nnet_7.3-12 dbplyr_1.4.4
## [59] locfit_1.5-9.4 utf8_1.1.4
## [61] caret_6.0-88 labeling_0.4.2
## [63] tidyselect_1.1.0 rlang_0.4.10
## [65] reshape2_1.4.4 munsell_0.5.0
## [67] tools_3.6.3 cachem_1.0.4
## [69] generics_0.1.0 RSQLite_2.2.1
## [71] evaluate_0.14 stringr_1.4.0
## [73] fastmap_1.1.0 fastmatrix_0.2-3571
## [75] mcmc_0.9-7 ModelMetrics_1.2.2.2
## [77] knitr_1.30 bit64_4.0.5
## [79] purrr_0.3.4 nlme_3.1-152
## [81] quantreg_5.75 biomaRt_2.42.1
## [83] compiler_3.6.3 curl_4.3
## [85] tibble_3.1.0 stringi_1.5.3
## [87] lattice_0.20-41 ProtGenerics_1.18.0
## [89] vctrs_0.3.6 pillar_1.5.0
## [91] lifecycle_1.0.0 BiocManager_1.30.10
## [93] data.table_1.14.0 cowplot_1.1.0
## [95] bitops_1.0-6 conquer_1.2.1
## [97] rtracklayer_1.46.0 R6_2.5.0
## [99] codetools_0.2-16 MASS_7.3-53
## [101] gtools_3.8.2 assertthat_0.2.1
## [103] SummarizedExperiment_1.16.1 openssl_1.4.3
## [105] pkgmaker_0.32.2 withr_2.4.2
## [107] GenomicAlignments_1.22.1 Rsamtools_2.2.3
## [109] GenomeInfoDbData_1.2.2 hms_0.5.3
## [111] quadprog_1.5-8 grid_3.6.3
## [113] rpart_4.1-15 timeDate_3043.102
## [115] L1pack_0.38.196 coda_0.19-4
## [117] class_7.3-15 rmarkdown_2.5
## [119] pROC_1.17.0.1 lubridate_1.7.9