First, dowload the snRNA-seq count matrix and associated meta tables into a folder, say, “Data”. In this tutorial, we will download the filtered data provided by Mathys et al from the Synapse at https://www.synapse.org/#!Synapse:syn18681734. After downloading, the Data folder should have the following files: filtered_column_metadata.txt, filtered_count_matrix.mtx, and filtered_gene_row_names.txt. Additionally, we can also download files 41586_2019_1195_MOESM3_ESM.xlsx and 41586_2019_1195_MOESM5_ESM.xlsx from the Supplementary Tables 1 &3 of the Mathys et al paper at https://www.nature.com/articles/s41586-019-1195-2#additional-information, as well as a mapping key file “snRNAseqPFC_BA10_id_mapping.csv” (https://www.synapse.org/#!Synapse:syn18694015) and a biospecimen meta file “snRNAseqPFC_BA10_biospecimen_metadata.csv” (https://www.synapse.org/#!Synapse:syn18642936) to add in more sample demographic information.
Load necessary libraries and set up input and output locations.
library(ggplot2)
library(Seurat)
library(dplyr)
library(patchwork)
library(Matrix)
library(openxlsx)
indir = 'Data/'
outdir = "Output/"
if(!file.exists(outdir)) dir.create(outdir, recursive = TRUE)
Let us read in sample meta data.
key = read.csv(paste0(indir, 'snRNAseqPFC_BA10_id_mapping.csv'), as.is = TRUE, header = TRUE)
key = unique(key[, c('projid', 'Subject')])
natsup1 = read.xlsx(paste0(indir, '41586_2019_1195_MOESM3_ESM.xlsx'), sheet = "S1")
natsup1$Diagnosis = c(YES = 'AD', NO = 'Control')[natsup1[, 'pathologic.diagnosis.of.AD']]
natsup3 = read.xlsx(paste0(indir, '41586_2019_1195_MOESM5_ESM.xlsx'), sheet = 1)
natsup = data.frame(natsup1, natsup3[match(natsup1$Subject, natsup3$Subject), !colnames(natsup3) %in% colnames(natsup1)], stringsAsFactors = FALSE)
biospecimen = read.csv(paste0(indir, 'snRNAseqPFC_BA10_biospecimen_metadata.csv'), as.is=TRUE, header = TRUE)
natsup$projid = key[match(natsup$Subject, key$Subject), 'projid']
natsup$individualID = biospecimen[match(natsup$projid, biospecimen$projid), 'individualID']
head(natsup, 5)
## Subject study age_death educ msex gpath amyloid plaq_n
## 1 ROS1 ROS 90+ 23 0 0.01718225 0 0.00000000
## 2 ROS2 ROS 90+ 24 0 0.03055274 0 0.00000000
## 3 ROS3 ROS 79.115674195756327 18 0 0.08986113 0 0.00000000
## 4 ROS4 ROS 90+ 18 0 0.03985507 0 0.03098783
## 5 ROS5 ROS 88.925393566050644 16 0 0.02080884 0 0.00000000
## cogdx pathologic.diagnosis.of.AD Diagnosis nft tangles
## 1 1 NO Control 0.02975843 0.3504917
## 2 1 NO Control 0.09166858 0.8630875
## 3 1 NO Control 0.11891828 0.2748281
## 4 4 NO Control 0.08918715 0.6956833
## 5 1 NO Control 0.06252178 0.8345100
## cogn_global_lv gpath_3neocort amyloid.group caa_4gp ceradsc braaksc
## 1 1.18041075 0.012015007 low 0 4 1
## 2 0.10671595 0.000000000 low 0 4 3
## 3 1.19261623 0.082914650 low 1 4 3
## 4 -1.70813203 0.039258059 low 3 3 2
## 5 -0.02834499 0.009967699 low 0 4 4
## niareagansc pathology.group projid individualID
## 1 3 no-pathology 20207013 R2880377
## 2 3 no-pathology 21412626 R4739508
## 3 3 no-pathology 20956867 R8725848
## 4 3 no-pathology 20104101 R4567280
## 5 3 no-pathology 21142003 R7288382
Write compiled sample meta table into a file.
write.table(natsup, paste0(outdir, "meta.samples.tsv"), quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
Now read in single cell data.
meta = read.table(paste0(indir, 'filtered_column_metadata.txt'), as.is = TRUE, header = TRUE)
colnames(meta)[colnames(meta) == 'TAG'] = 'Barcode'
meta = data.frame(meta, natsup[match(meta$projid, natsup$projid), colnames(natsup) != 'projid'], stringsAsFactors = FALSE)
rownames(meta) = meta$Barcode
#read in count matrix
dat = readMM(paste0(indir, 'filtered_count_matrix.mtx'))
features = scan(paste0(indir, 'filtered_gene_row_names.txt'), what = character())
rownames(dat) = features
colnames(dat) = meta$Barcode
Create Seurat object from the count matrix.
SeuratObject = CreateSeuratObject(counts = dat, project = "SeuratObjectProject", min.cells = 3, min.features = 200, names.field=1, names.delim='.', meta.data=meta)
SeuratObject@meta.data$orig.ident = NULL
SeuratObject@assays$RNA@meta.features = data.frame(Geneid = rownames(SeuratObject), Symbol = rownames(SeuratObject), row.names = rownames(SeuratObject), stringsAsFactors = FALSE)
rm(dat)
Note that the mitochondrial reads has been removed by the original study from the count matrix we are using here. However to check the fraction of mitochondrial reads in a new dataset, execute a command like the following
SeuratObject = PercentageFeatureSet(SeuratObject, pattern = "^MT-", col.name = "percent.mt")
summary(SeuratObject[[]]$percent.mt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
Now we can check the data qulity by first plotting the distribution of number of features and total read counts.
meta1 = SeuratObject@meta.data
write.table(meta1, file=paste0(outdir, 'meta.seurat.raw.tsv'), quote = FALSE, sep = '\t', row.names = FALSE)
meta1 = meta1[rev(order(meta1$Diagnosis)), ]
meta1$sid = unsplit(lapply(split(meta1, meta1$Diagnosis), function(x) paste0(c(AD = 'AD', Control = 'Ct')[x$Diagnosis[1]], as.integer(factor(x$projid, level = unique(x$projid))))), meta1$Diagnosis)
meta1$sid = factor(meta1$sid, levels = unique(meta1$sid))
theme_bw2 = function() theme_bw() + theme(panel.grid = element_blank())
p1 = ggplot(meta1, aes(x = sid, y = nFeature_RNA, fill = Diagnosis)) + geom_violin(width = 0.8, scale = 'width') + theme_bw2() + theme(legend.position = 'none', axis.text.x = element_text(angle = 45, size = 6, hjust = 1), axis.text.y = element_text(size = 7)) + xlab(NULL)
p2 = ggplot(meta1, aes(x = sid, y = nCount_RNA, fill = Diagnosis)) + geom_violin(width = 0.8, scale = 'width') + theme_bw2() + theme(legend.position = 'none', axis.text.x = element_text(angle = 45, size = 6, hjust = 1), axis.text.y = element_text(size = 7)) + xlab(NULL)
print(p1 / p2)
We can also use FeatureScatter to visualize feature-feature relationships per donor.
SeuratObject@meta.data$projid = factor(SeuratObject@meta.data$projid)
p3 = FeatureScatter(SeuratObject, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", pt.size=0.1, group.by='projid') + theme(title=element_text(size=8), legend.text=element_text(size=6), legend.title=element_text(size=7))
print(p3)
Perform some simple quality control (QC) by first filtering out cells that have unique gene counts over 6,000 or less than 200. The filter on percent.mt has no actual effect here and is just used for illustration purpose.
SeuratObject
## An object of class Seurat
## 17775 features across 70634 samples within 1 assay
## Active assay: RNA (17775 features)
SeuratObject = subset(SeuratObject, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 5)
SeuratObject
## An object of class Seurat
## 17775 features across 69787 samples within 1 assay
## Active assay: RNA (17775 features)
After QC, normalize the count data using Seurat’s SCTransform function. This may take a while to finish.
SeuratObject = SCTransform(SeuratObject, verbose = TRUE)
##
|
| | 0%
|
|========= | 12%
|
|================== | 25%
|
|========================== | 38%
|
|=================================== | 50%
|
|============================================ | 62%
|
|==================================================== | 75%
|
|============================================================= | 88%
|
|======================================================================| 100%
##
|
| | 0%
|
|= | 1%
|
|== | 3%
|
|=== | 4%
|
|==== | 6%
|
|===== | 7%
|
|====== | 9%
|
|======= | 10%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 25%
|
|================== | 26%
|
|=================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 38%
|
|=========================== | 39%
|
|============================ | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 48%
|
|================================== | 49%
|
|==================================== | 51%
|
|===================================== | 52%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================== | 59%
|
|=========================================== | 61%
|
|============================================ | 62%
|
|============================================= | 64%
|
|============================================== | 65%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 99%
|
|======================================================================| 100%
##
|
| | 0%
|
|= | 1%
|
|== | 3%
|
|=== | 4%
|
|==== | 6%
|
|===== | 7%
|
|====== | 9%
|
|======= | 10%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 25%
|
|================== | 26%
|
|=================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 38%
|
|=========================== | 39%
|
|============================ | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 48%
|
|================================== | 49%
|
|==================================== | 51%
|
|===================================== | 52%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================== | 59%
|
|=========================================== | 61%
|
|============================================ | 62%
|
|============================================= | 64%
|
|============================================== | 65%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 99%
|
|======================================================================| 100%
Extract gene-level statistics and make plots such as the detection rate.
genes = SeuratObject@assays$RNA@meta.features[rownames(SeuratObject), ]
genes = data.frame(genes, SeuratObject@assays[['SCT']]@meta.features, row.names = rownames(genes), stringsAsFactors = FALSE)
hist(genes[, 'sct.detection_rate'])
Perform dimensionality reduction by PCA.
SeuratObject = RunPCA(SeuratObject, verbose = FALSE)
#Plot PCA elbow
ElbowPlot(SeuratObject, ndims = 50)
After inspecting the elbow plot, we choose principal components 1:30 for UMAP/tSEN visualization and clustering.
SeuratObject = RunUMAP(SeuratObject, dims = 1:30, verbose = FALSE)
SeuratObject = RunTSNE(SeuratObject, dims = 1:30, verbose = FALSE)
SeuratObject = FindNeighbors(SeuratObject, verbose = FALSE)
SeuratObject = FindClusters(SeuratObject, resolution = 0.8, verbose = FALSE)
Plot UMAP.
p1 <- DimPlot(SeuratObject, reduction = "umap", label = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
print(p1)
Save the SeuratObject into a file.
saveRDS(SeuratObject, file = paste0(outdir, 'SeuratObject.RDS'))
A precomputed version of the SeuratObject file is available at https://www.synapse.org/#!Synapse:syn26477245
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-common/intel/parallel_studio_xe_2018/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
##
## locale:
## [1] C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] openxlsx_4.1.4 Matrix_1.3-2 patchwork_1.1.1 dplyr_1.0.6
## [5] Seurat_3.1.4 ggplot2_3.3.3
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 Rtsne_0.15 colorspace_2.0-1
## [4] ellipsis_0.3.2 ggridges_0.5.1 farver_2.1.0
## [7] leiden_0.3.1 listenv_0.8.0 npsurv_0.4-0
## [10] ggrepel_0.9.1 RSpectra_0.16-0 fansi_0.5.0
## [13] mvtnorm_1.0-11 codetools_0.2-16 splines_3.6.0
## [16] mnormt_1.5-5 lsei_1.2-0 knitr_1.26
## [19] TFisher_0.2.0 jsonlite_1.7.2 ica_1.0-2
## [22] cluster_2.0.8 png_0.1-7 uwot_0.1.5
## [25] sctransform_0.2.1 compiler_3.6.0 httr_1.4.1
## [28] assertthat_0.2.1 lazyeval_0.2.2 htmltools_0.5.0
## [31] tools_3.6.0 rsvd_1.0.2 igraph_1.2.6
## [34] gtable_0.3.0 glue_1.4.2 RANN_2.6.1
## [37] reshape2_1.4.4 rappdirs_0.3.3 Rcpp_1.0.6
## [40] Biobase_2.46.0 vctrs_0.3.8 multtest_2.42.0
## [43] gdata_2.18.0 ape_5.3 nlme_3.1-139
## [46] gbRd_0.4-11 lmtest_0.9-37 xfun_0.12
## [49] stringr_1.4.0 globals_0.14.0 lifecycle_1.0.0
## [52] irlba_2.3.3 gtools_3.8.1 future_1.21.0
## [55] MASS_7.3-51.4 zoo_1.8-6 scales_1.1.1
## [58] parallel_3.6.0 sandwich_2.5-1 RColorBrewer_1.1-2
## [61] yaml_2.2.1 reticulate_1.20 pbapply_1.4-3
## [64] gridExtra_2.3 stringi_1.6.2 mutoss_0.1-12
## [67] plotrix_3.7-7 caTools_1.18.0 BiocGenerics_0.32.0
## [70] zip_2.0.4 bibtex_0.4.2.1 Rdpack_0.11-1
## [73] rlang_0.4.11 pkgconfig_2.0.3 bitops_1.0-6
## [76] evaluate_0.14 lattice_0.20-38 ROCR_1.0-7
## [79] purrr_0.3.4 labeling_0.4.2 htmlwidgets_1.5.1
## [82] cowplot_1.1.1 tidyselect_1.1.1 parallelly_1.25.0
## [85] RcppAnnoy_0.0.14 plyr_1.8.6 magrittr_2.0.1
## [88] R6_2.5.0 gplots_3.0.3 generics_0.1.0
## [91] multcomp_1.4-11 DBI_1.1.0 pillar_1.6.1
## [94] withr_2.4.2 sn_1.5-4 fitdistrplus_1.0-14
## [97] survival_2.44-1.1 tibble_3.1.2 future.apply_1.7.0
## [100] tsne_0.1-3 crayon_1.4.1 KernSmooth_2.23-15
## [103] utf8_1.2.1 plotly_4.9.2.1 rmarkdown_1.12
## [106] grid_3.6.0 data.table_1.12.8 blob_1.2.0
## [109] metap_1.2 digest_0.6.27 tidyr_1.1.3
## [112] numDeriv_2016.8-1.1 RcppParallel_4.4.4 stats4_3.6.0
## [115] munsell_0.5.0 viridisLite_0.4.0