Bulk RNA-seq deconvolution and performance evaluation by scRNA-seq dataset

Author: Peng Xu

Date: 11/20/2021

Note: helper_functions codes were adopted from “https://github.com/favilaco/deconv_benchmark/Master_deconvolution.R”. Nature Communications; https://doi.org/10.1038/s41467-020-19015-1.

source('CIBERSORT.R')
source('helper_functions.R')
library(Matrix)
library(Biobase)

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following object is masked from 'package:Matrix':
## 
##     which

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:Biobase':
## 
##     combine

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack

library("EnsDb.Hsapiens.v86")

## Loading required package: ensembldb

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:Matrix':
## 
##     expand

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## Loading required package: GenomeInfoDb

## Loading required package: GenomicFeatures

## Loading required package: AnnotationDbi

## 
## Attaching package: 'AnnotationDbi'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: AnnotationFilter

## 
## Attaching package: 'ensembldb'

## The following object is masked from 'package:dplyr':
## 
##     filter

## The following object is masked from 'package:stats':
## 
##     filter

Input dataset from Cluster

bulk: bulk RNA-seq count matrix; filtered.counts: snRNA-seq count matrix; filtered.colMetadata: meta file for snRNA-seq; markers

bulk = readRDS("counts.geneID.sinai_pipeline.RDS")
filtered.counts <- readMM("filtered_count_matrix.mtx")
rownames(filtered.counts) <- readLines("filtered_gene_row_names.txt")
filtered.colMetadata <- read.delim("filtered_column_metadata.txt")
markers_symbol = read.delim("markers_symbol.tsv")

##Preprocessing data matrix

bulk_exp = exprs(bulk)
meta = phenoData(bulk)@data
colnames(bulk_exp) = meta[colnames(bulk_exp),"projid"]
row.names(filtered.colMetadata) = filtered.colMetadata[,1]
colnames(filtered.counts) = row.names(filtered.colMetadata)

Convert Ensemble names to gene symbols

G_list <- ensembldb::select(EnsDb.Hsapiens.v86, keys= row.names(bulk_exp), keytype = "GENEID", columns = c("GENEID","SYMBOL"))
colnames(G_list) = c("gene","Symbol")
row.names(G_list) = G_list$gene
common_gene = as.character(G_list[G_list$Symbol %in% rownames(filtered.counts),"Symbol"])
common_gene_uniq = names(table(common_gene))[table(common_gene)==1]
common_gene_uniq_id = as.character(G_list[G_list$Symbol %in% common_gene_uniq,"gene"])

Select bulk RNA-seq and snRNA-seq from the same individuals

T_orig = bulk_exp[common_gene_uniq_id,]
row.names(T_orig) = G_list[common_gene_uniq_id,"Symbol"]
common_projid = colnames(T_orig)[colnames(T_orig) %in% unique(filtered.colMetadata$projid)]
T_orig = T_orig[,common_projid]

Build training set

set.seed(1)
sample_subset = sample(common_projid,20)
train_cellID = filtered.counts[,filtered.colMetadata$projid %in% sample_subset]
train = train_cellID

Construct phenotype dataset pDataC

pDataC = filtered.colMetadata[filtered.colMetadata$projid %in% common_projid,]
pDataC = data.frame(cellID = pDataC$TAG, cellType =  pDataC$broad.cell.type, sampleID = pDataC$projid)
pDataC[pDataC$cellType=="Ex" | pDataC$cellType=="In", ]$cellType = "Neu"
row.names(pDataC) = pDataC$cellID
colnames(train) = pDataC[colnames(train),]$cellType

Generate P matrix of cell proportions based on single cell

individual = unique(pDataC$sampleID)
individual_fraction = list()
for(i in 1:length(individual)){
  individual_fraction[[i]] = data.frame(individual[i],table(pDataC[pDataC$sampleID == individual[i],"cellType"])/nrow(pDataC[pDataC$sampleID == individual[i],]))
}
P = do.call(rbind.data.frame, individual_fraction)
colnames(P) = c("tissue", "CT", "expected_values")

Generate C matrix of cell-type expressions as the reference for bulk RNAseq based methods

normalization = "TMM"
train_TMM = Scaling(train, normalization)

## Repeated column names found in count matrix

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

cellType <- colnames(train_TMM)
group = list()
for(i in unique(cellType)){ 
  group[[i]] <- which(cellType %in% i)
}
C_orig = lapply(group,function(x) Matrix::rowMeans(train_TMM[,x])) #C should be made with the mean (not sum) to agree with the way markers were selected
C = do.call(cbind.data.frame, C_orig)

Generate variation matrix of cell-type expressions as the reference for bulk RNAseq based methods

refProfiles.var = lapply(group,function(x) train_TMM[,x])
refProfiles.var = lapply(refProfiles.var, function(x) matrixStats::rowSds(Matrix::as.matrix(x)))
refProfiles.var = do.call(cbind.data.frame, refProfiles.var)
rownames(refProfiles.var) <- rownames(C_orig)

Transformation, scaling/normalization, and marker selection

T = Scaling(T_orig, normalization)
train_cellID_TMM =  Scaling(train_cellID, normalization)

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

marker_strategy = "all"
marker_distrib = marker_strategies(markers_symbol, marker_strategy, C)

Deconvolution and evaluation for bulk RNA-seq methods

fraction_out = data.frame()
evaluation_out = data.frame()
bulk_methods = c("CIBERSORT","nnls","FARDEEP")
for(i in bulk_methods){
    RESULTS = Deconvolution(T = T, C = C, phenoDataC = pDataC, method = i, P = P, elem = to_remove, marker_distrib = marker_distrib, refProfiles.var = refProfiles.var) 
    RESULTS$method = i
    fraction_out = rbind(fraction_out,RESULTS)
    evaluation = RESULTS %>% dplyr::summarise(RMSE = sqrt(mean((observed_values-expected_values)^2)) %>% round(.,4), 
                                              Pearson=cor(observed_values,expected_values) %>% round(.,4))
    evaluation$method = i
    evaluation_out = rbind(evaluation_out, evaluation)
}

## Warning in data.table::melt(RESULTS): The melt generic in data.table has
## been passed a matrix and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(RESULTS). In the next version, this warning will become an error.

## Loading required package: nnls

## Warning in data.table::melt(RESULTS): The melt generic in data.table has
## been passed a matrix and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(RESULTS). In the next version, this warning will become an error.

## Loading required package: FARDEEP

Deconvolution and evaluation for single-cell methods

sc_methods = c("MuSiC","SCDC","BisqueRNA")
for(i in sc_methods){
    if(i == "BisqueRNA"){
      RESULTS = Deconvolution(T = T_orig, C = train_cellID, method = i, phenoDataC = pDataC, P = P, elem = to_remove, refProfiles.var = refProfiles.var)
    }else{
      RESULTS = Deconvolution(T = T, C = train_cellID_TMM, method = i, phenoDataC = pDataC, P = P, elem = to_remove, refProfiles.var = refProfiles.var) 
    }
    RESULTS$method = i
    fraction_out = rbind(fraction_out,RESULTS)
    evaluation = RESULTS %>% dplyr::summarise(RMSE = sqrt(mean((observed_values-expected_values)^2)) %>% round(.,4), 
                                              Pearson=cor(observed_values,expected_values) %>% round(.,4))
    evaluation$method = i
    evaluation_out = rbind(evaluation_out, evaluation)
}

## Loading required package: xbioc

## Loading required package: MuSiC

## Loading required package: ggplot2

## Loading required package: SCDC

## Creating Basis Matrix adjusted for maximal variance weight

## Used 17036 common genes...

## Used 7 cell types in deconvolution...

## 20149910 has common genes 15977 ...

## WNNLS Converged at iteration 8

## 20249897 has common genes 15515 ...

## WNNLS Converged at iteration 19

## 10536568 has common genes 15538 ...

## WNNLS Converged at iteration 14

## 11399321 has common genes 15644 ...

## WNNLS Converged at iteration 12

## 20179164 has common genes 15902 ...

## WNNLS Converged at iteration 2

## 20275399 has common genes 15974 ...

## WNNLS Converged at iteration 8

## 21189544 has common genes 15548 ...

## WNNLS Converged at iteration 12

## 20977678 has common genes 15801 ...

## WNNLS Converged at iteration 12

## 10222853 has common genes 15995 ...

## WNNLS Converged at iteration 8

## 11302830 has common genes 15897 ...

## WNNLS Converged at iteration 4

## 11409232 has common genes 15932 ...

## WNNLS Converged at iteration 4

## 10101327 has common genes 15944 ...

## WNNLS Converged at iteration 12

## 20956867 has common genes 15919 ...

## WNNLS Converged at iteration 6

## 11336574 has common genes 15848 ...

## WNNLS Converged at iteration 4

## 10260309 has common genes 15968 ...

## WNNLS Converged at iteration 6

## 21126823 has common genes 16067 ...

## WNNLS Converged at iteration 10

## 11345331 has common genes 15967 ...

## WNNLS Converged at iteration 4

## 11342432 has common genes 16111 ...

## WNNLS Converged at iteration 4

## 11200645 has common genes 15846 ...

## WNNLS Converged at iteration 8

## 10288185 has common genes 15918 ...

## WNNLS Converged at iteration 4

## 20104101 has common genes 15995 ...

## WNNLS Converged at iteration 6

## 20261901 has common genes 15559 ...

## WNNLS Converged at iteration 7

## 10248033 has common genes 15769 ...

## WNNLS Converged at iteration 12

## 20207013 has common genes 15943 ...

## WNNLS Converged at iteration 4

## 20173942 has common genes 15932 ...

## WNNLS Converged at iteration 8

## 20254740 has common genes 15836 ...

## WNNLS Converged at iteration 4

## 11609672 has common genes 16023 ...

## WNNLS Converged at iteration 4

## 10261026 has common genes 15954 ...

## WNNLS Converged at iteration 4

## 20112377 has common genes 15967 ...

## WNNLS Converged at iteration 6

## 21142003 has common genes 15884 ...

## WNNLS Converged at iteration 3

## 11072071 has common genes 15910 ...

## WNNLS Converged at iteration 6

## 10514454 has common genes 16062 ...

## WNNLS Converged at iteration 5

## 11399871 has common genes 15882 ...

## WNNLS Converged at iteration 1

## 20963866 has common genes 15938 ...

## WNNLS Converged at iteration 4

## 21412626 has common genes 15898 ...

## WNNLS Converged at iteration 6

## Loading required package: BisqueRNA

## Decomposing into 7 cell types.

## Using 17383 genes in both bulk and single-cell expression.

## Converting single-cell counts to CPM and filtering zero variance genes.

## Filtered 87 zero variance genes.

## Converting bulk counts to CPM and filtering unexpressed genes.

## Filtered 269 unexpressed genes.

## Generating single-cell based reference from 27904 cells.

## Inferring bulk transformation from single-cell alone.

## Applying transformation to bulk samples and decomposing.

write.table(fraction_out,file=paste0("Deconvolution_fraction.tsv"),sep="\t",quote = FALSE,row.names = FALSE,col.names = TRUE)
write.table(evaluation_out,file=paste0("Deconvolution_evaluation.tsv"),sep="\t",quote = FALSE,row.names = FALSE,col.names = TRUE)
write.table(sample_subset,file=paste0("Deconvolution_trainSamples.tsv"),sep="\t",quote = FALSE,row.names = FALSE,col.names = TRUE)

Plot figures

library(ggplot2)
library(tidyr)
library(dplyr)

Prepare inpute files

training = read.delim("Deconvolution_trainSamples.tsv")
fraction_raw = read.delim("Deconvolution_fraction.tsv")
fraction_raw = fraction_raw[fraction_raw$method != "FARDEEP" & fraction_raw$method != "nnls",]
fraction = fraction_raw %>% pivot_longer(c(observed_values,expected_values),names_to = "type",values_to = "fraction")
fraction_observed = fraction[fraction$type=="observed_values",]
fraction_expected = fraction[fraction$type=="expected_values",]
fraction_expected$method = "snRNA-seq"
fraction_all = rbind(fraction_observed,unique(fraction_expected))
fraction_all$method = gsub("BisqueRNA","Bisque", fraction_all$method)

Performance of different deconvolution methods

fraction_test = fraction_all[! fraction_all$tissue %in% training$x,]
p1 = ggplot(fraction_test,aes(x=CT,y=fraction,color=method))+geom_boxplot()+
  ylab("Fraction") +xlab("Cell Type")+
  theme_bw()+theme(panel.border = element_rect(colour = "black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text=element_text(color="black",size=12),axis.title=element_text(color="black",size=13),legend.text=element_text(color="black",size=13),legend.title=element_text(color="black",size=13))
p1

ggsave(p1, file=paste0("Deconvolution_test.pdf"), width=7, height=4)

Evaluation of different deconvolution methods

fraction_test_eval = fraction_raw %>% dplyr::filter(! tissue %in% training$x) %>% group_by(tissue,method) %>% 
  summarise(RMSE = sqrt(mean((observed_values-expected_values)^2)) %>% round(.,4),Pearson=cor(observed_values,expected_values) %>% round(.,4))

## `summarise()` regrouping output by 'tissue' (override with `.groups` argument)

fraction_test_eval_sum  = fraction_test_eval %>% group_by(method) %>% summarise(RMSE_mean = mean(RMSE),Pearson_mean = mean(Pearson))

## `summarise()` ungrouping output (override with `.groups` argument)

p2 = ggplot(fraction_test_eval,aes(x=method,y=Pearson,color=method))+geom_boxplot()+
  ylab("Correlation") +xlab("Method")+
  theme_bw()+theme(panel.border = element_rect(colour = "black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),axis.text=element_text(color="black",size=12),axis.title=element_text(color="black",size=13),legend.text=element_text(color="black",size=13),legend.title=element_text(color="black",size=13))
p2

ggsave(p2, file=paste0("Deconvolution_test.cor.pdf"), width=4, height=3)

p3 = ggplot(fraction_test_eval,aes(x=method,y=RMSE,color=method))+geom_boxplot()+
  ylab("RMSE") +xlab("Method")+
  theme_bw()+theme(panel.border = element_rect(colour = "black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),axis.text=element_text(color="black",size=12),axis.title=element_text(color="black",size=13),legend.text=element_text(color="black",size=13),legend.title=element_text(color="black",size=13))
p3

ggsave(p3, file=paste0("Deconvolution_test.RMSE.pdf"), width=4, height=3)

Session info

sessionInfo()

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /sc/arion/work/xup04/miniconda2_new/lib/libopenblasp-r0.3.12.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BisqueRNA_1.0.4           SCDC_0.0.0.9000          
##  [3] MuSiC_0.1.1               ggplot2_3.3.2            
##  [5] xbioc_0.1.19              FARDEEP_1.0.1            
##  [7] nnls_1.4                  EnsDb.Hsapiens.v86_2.99.0
##  [9] ensembldb_2.10.2          AnnotationFilter_1.10.0  
## [11] GenomicFeatures_1.38.2    AnnotationDbi_1.48.0     
## [13] GenomicRanges_1.38.0      GenomeInfoDb_1.22.1      
## [15] IRanges_2.20.2            S4Vectors_0.24.4         
## [17] tidyr_1.1.2               dplyr_1.0.2              
## [19] Biobase_2.46.0            BiocGenerics_0.32.0      
## [21] Matrix_1.3-2              preprocessCore_1.48.0    
## [23] e1071_1.7-4              
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.2.1             BiocFileCache_1.10.2       
##   [3] plyr_1.8.6                  lazyeval_0.2.2             
##   [5] splines_3.6.3               BiocParallel_1.20.1        
##   [7] digest_0.6.27               foreach_1.5.1              
##   [9] htmltools_0.5.1.1           fansi_0.4.2                
##  [11] magrittr_2.0.1              checkmate_2.0.0            
##  [13] memoise_2.0.0               limma_3.42.2               
##  [15] recipes_0.1.16              Biostrings_2.54.0          
##  [17] gower_0.2.2                 matrixStats_0.58.0         
##  [19] limSolve_1.5.6              MCMCpack_1.4-9             
##  [21] askpass_1.1                 lpSolve_5.6.15             
##  [23] prettyunits_1.1.1           colorspace_2.0-0           
##  [25] blob_1.2.1                  rappdirs_0.3.3             
##  [27] xfun_0.19                   crayon_1.4.1               
##  [29] RCurl_1.98-1.2              survival_3.2-7             
##  [31] iterators_1.0.12            glue_1.4.2                 
##  [33] registry_0.5-1              gtable_0.3.0               
##  [35] ipred_0.9-11                zlibbioc_1.32.0            
##  [37] XVector_0.26.0              MatrixModels_0.4-1         
##  [39] DelayedArray_0.12.3         SparseM_1.78               
##  [41] scales_1.1.1                pheatmap_1.0.12            
##  [43] DBI_1.1.0                   edgeR_3.28.1               
##  [45] Rcpp_1.0.6                  xtable_1.8-4               
##  [47] progress_1.2.2              bit_4.0.4                  
##  [49] lava_1.6.9                  prodlim_2019.11.13         
##  [51] httr_1.4.2                  RColorBrewer_1.1-2         
##  [53] ellipsis_0.3.1              farver_2.1.0               
##  [55] pkgconfig_2.0.3             XML_3.99-0.3               
##  [57] nnet_7.3-12                 dbplyr_1.4.4               
##  [59] locfit_1.5-9.4              utf8_1.1.4                 
##  [61] caret_6.0-88                labeling_0.4.2             
##  [63] tidyselect_1.1.0            rlang_0.4.10               
##  [65] reshape2_1.4.4              munsell_0.5.0              
##  [67] tools_3.6.3                 cachem_1.0.4               
##  [69] generics_0.1.0              RSQLite_2.2.1              
##  [71] evaluate_0.14               stringr_1.4.0              
##  [73] fastmap_1.1.0               fastmatrix_0.2-3571        
##  [75] mcmc_0.9-7                  ModelMetrics_1.2.2.2       
##  [77] knitr_1.30                  bit64_4.0.5                
##  [79] purrr_0.3.4                 nlme_3.1-152               
##  [81] quantreg_5.75               biomaRt_2.42.1             
##  [83] compiler_3.6.3              curl_4.3                   
##  [85] tibble_3.1.0                stringi_1.5.3              
##  [87] lattice_0.20-41             ProtGenerics_1.18.0        
##  [89] vctrs_0.3.6                 pillar_1.5.0               
##  [91] lifecycle_1.0.0             BiocManager_1.30.10        
##  [93] data.table_1.14.0           cowplot_1.1.0              
##  [95] bitops_1.0-6                conquer_1.2.1              
##  [97] rtracklayer_1.46.0          R6_2.5.0                   
##  [99] codetools_0.2-16            MASS_7.3-53                
## [101] gtools_3.8.2                assertthat_0.2.1           
## [103] SummarizedExperiment_1.16.1 openssl_1.4.3              
## [105] pkgmaker_0.32.2             withr_2.4.2                
## [107] GenomicAlignments_1.22.1    Rsamtools_2.2.3            
## [109] GenomeInfoDbData_1.2.2      hms_0.5.3                  
## [111] quadprog_1.5-8              grid_3.6.3                 
## [113] rpart_4.1-15                timeDate_3043.102          
## [115] L1pack_0.38.196             coda_0.19-4                
## [117] class_7.3-15                rmarkdown_2.5              
## [119] pROC_1.17.0.1               lubridate_1.7.9