This tutorial show how to run eQTL analysis in Cluster1 (Excitatory Neurons) of snRNA-seq of the ROSMAP cohort.
R version: R/4.0.2
Load necessary packages and functions into R
rm(list = ls())
library(Seurat)
library(MatrixEQTL)
outdir <- "eQTL_analysis"
if(!file.exists(outdir)) dir.create(outdir, recursive = TRUE)
dir.create("eQTL_analysis/QQPlot")
## Warning in dir.create("eQTL_analysis/QQPlot"): 'eQTL_analysis/QQPlot' already
## exists
dir.create("eQTL_analysis/Cis_eQTLs")
## Warning in dir.create("eQTL_analysis/Cis_eQTLs"): 'eQTL_analysis/Cis_eQTLs'
## already exists
dir.create("eQTL_analysis/Trans_eQTLs")
## Warning in dir.create("eQTL_analysis/Trans_eQTLs"): 'eQTL_analysis/Trans_eQTLs'
## already exists
Now, since the data files is huge, load it from cloud. A precomputed version of the SeuratObject file is available at https://www.synapse.org/#!Synapse:syn26477245 Please download the precomputed SeuratObject first.
obj=readRDS("SeuratObject.RDS")
print(obj)
## An object of class Seurat
## 35350 features across 70634 samples within 2 assays
## Active assay: SCT (17575 features, 3000 variable features)
## 1 other assay present: RNA
## 3 dimensional reductions calculated: pca, umap, tsne
Here, we extract cells of AD cases
obj_earlyAD=subset(x = obj, subset = pathology.group == "early-pathology")
obj_lateAD=subset(x = obj, subset = pathology.group == "late-pathology")
obj_AD=merge(x = obj_earlyAD, y = obj_lateAD)
data = GetAssayData( obj_AD )
meta = obj_AD@meta.data
Next, we need to read the clinical meta file:ROSMAP_clinical.csv.
It is stored in https://www.synapse.org/Portal.html#!Synapse:syn3157322 The ID key is stored in “ROSMAP_IDkey.txt” file: https://www.synapse.org/#!Synapse:syn3382527
Please download the two files before running the codes below:
clinical = read.csv('ROSMAP_clinical.csv')
samples01=unique(meta$projid)
wgs=read.table("ROSMAP_IDkey.txt",header=T,sep="\t")
overlap=intersect(wgs$projid,samples01)
wgs02=wgs[match(overlap,wgs$projid),]
write.table(wgs02,"eQTL_analysis/Samples.Matched.sc.wgs.txt",col.names=T,row.names=F,sep="\t",quote=F)
write.table(wgs02[,2],"eQTL_analysis/WGSID.Matched.sc.wgs.txt",col.names=F,row.names=F,sep="\t",quote=F)
projid2=wgs02[,1]
Calculate Cell count per cluster per individual
totalcluster=length(unique(meta$seurat_clusters))
newmatrix=matrix(data=NA,nrow=totalcluster,ncol=length(projid2))
colnames(newmatrix)=wgs02[,2]
rownames(newmatrix)=paste("Cluster",seq(0,21),sep="")
for (index in 0:(totalcluster-1)){
mycluster=meta[meta$seurat_clusters==index,]
for (i in 1:length(projid2)){
cells=mycluster[mycluster$projid==projid2[i],]
newmatrix[index+1,i]=dim(cells)[1]
}
}
write.table(newmatrix,"eQTL_analysis/CellCount.Clusters.txt",col.names=T,row.names=T,sep="\t",quote=F)
samples=matrix(data=NA,nrow=2,ncol=dim(newmatrix)[1])
rownames(samples)=c("Threshold:10","Threshold:5")
colnames(samples)=paste("Cluster",seq(0,21),sep="")
for (i in 1:dim(newmatrix)[1]){
output=t(newmatrix[i,newmatrix[i,]>=5])
mypro=wgs[match(colnames(output),wgs$wgs_id),1]
output2=rbind(output,mypro)
j=i-1
outputfile=paste("eQTL_analysis/Cluster.Sample.CellCount/Cluster",j,".SampleCellCount.txt",sep="")
write.table(output2,outputfile,col.names=T,row.names=F,sep="\t",quote=F)
wgsidoutput=paste("eQTL_analysis/Cluster.Sample.CellCount/Cluster",j,".WGSID.Used.txt",sep="")
write.table(t(t(colnames(output))),wgsidoutput,col.names=F,row.names=F,sep="\t",quote=F)
}
The normalized gene expression matrix is averaged by per gene, per cell type and per individual
totalcluster=length(unique(meta$seurat_clusters))-1
for (index in 0:totalcluster){
mycluster=meta[meta$seurat_clusters==index,]
mydata=data[,match(rownames(mycluster),colnames(data))]
individualinput=paste("eQTL_analysis/Cluster.Sample.CellCount/Cluster",index,".SampleCellCount.txt",sep="")
usedind=read.table(individualinput,header=T,sep="\t")
newmatrix=matrix(data=NA,nrow=17575,ncol=dim(usedind)[2])
rownames(newmatrix)=rownames(data)
for (i in 1:dim(usedind)[2]){
cells=mycluster[mycluster$projid==usedind[2,i],]
sample=mydata[,match(rownames(cells),colnames(mydata))]
newmatrix[,i]=Matrix::rowMeans(sample)
}
colnames(newmatrix)=colnames(usedind)
zerothreshold=0.1*dim(usedind)[2]
pass=t(colnames(usedind))
colnames(pass)=colnames(usedind)
rownames(pass)="id"
for (j in 1:dim(newmatrix)[1]){
if (length(newmatrix[j,which(newmatrix[j,]==0)]) <zerothreshold){
newrowname=c(rownames(pass),rownames(newmatrix)[j])
pass=rbind(pass,newmatrix[j,])
rownames(pass)=newrowname
}
}
output=paste("eQTL_analysis/Expression_Matrix/Cluster",index,".GeneExpression.txt",sep="")
write.table(pass,output,col.names=F,row.names=T,sep="\t",quote=F)
}
#other files for MatrixEQTL The genotype matrix file “Cluster1.GenoT_genome.txt” , the snp location file “Cluster1.snps.ROSMAP.location.txt” , covariate file “Covariates.Cluster1.txt” and gene location file “geneloc.txt” are stored in .
Please downloaded the above three files before run codes below.
Detailed explaination of each file please check the Matrix_eQTL tutorial:http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
Use Linear model here
useModel = modelLINEAR
Read in the genotype matrix and snp location files of Cluster 1.
cluster=1
SNP_file_name = paste("Cluster",cluster,".GenoT_genome.txt", sep="");
snps_location_file_name = paste("Cluster",cluster,".snps.ROSMAP.location.txt", sep="");
Read in the averaged gene expression matrix and gene location of Cluster 1.
expression_file_name = paste("eQTL_analysis", "/Expression_Matrix/Cluster",cluster,".GeneExpression.txt", sep="");
gene_location_file_name = "geneloc.txt";
Read in the covariates of Cluster 1.
covariates_file_name = paste("Covariates.Cluster",cluster,".txt", sep="");
Run the eQTL analysis using Matrix_eQTL:
output_file_name_cis = paste(base.dir,"/Cis_eQTLs/Cis.eQTLs.Cluster",cluster,".genome.txt",sep="");
output_file_name_tra = paste(base.dir,"/Trans_eQTLs/Trans.eQTLs.Cluster",cluster,".genome.txt",sep="");
snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
mygene=read.table(expression_file_name,header=TRUE,stringsAsFactors = FALSE);
pvOutputThreshold_cis = 1;
pvOutputThreshold_tra = 0.05/(as.numeric(dim(snpspos)[1])*as.numeric(dim(mygene)[1]));
errorCovariance = numeric();
cisDist = 1e6;
snps = SlicedData$new();
snps$fileDelimiter = "\t";
snps$fileOmitCharacters = "-1";
snps$fileSkipRows = 1;
snps$fileSkipColumns = 1;
snps$fileSliceSize = 2000;
snps$LoadFile(SNP_file_name);
gene = SlicedData$new();
gene$fileDelimiter = "\t";
gene$fileOmitCharacters = "NA";
gene$fileSkipRows = 1;
gene$fileSkipColumns = 1;
gene$fileSliceSize = 2000;
gene$LoadFile(expression_file_name);
cvrt = SlicedData$new();
cvrt$fileDelimiter = "\t";
cvrt$fileOmitCharacters = "NA";
cvrt$fileSkipRows = 1;
cvrt$fileSkipColumns = 1;
if(length(covariates_file_name)>0) {
cvrt$LoadFile(covariates_file_name);
}
me = Matrix_eQTL_main(
snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name_tra,
pvOutputThreshold = pvOutputThreshold_tra,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = TRUE,
output_file_name.cis = output_file_name_cis,
pvOutputThreshold.cis = pvOutputThreshold_cis,
snpspos = snpspos,
genepos = genepos,
cisDist = cisDist,
pvalue.hist = "qqplot",
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE);
Output relavant information of output
cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n');
cat('Detected local eQTLs:', '\n');
show(me$cis$eqtls)
cat('Detected distant eQTLs:', '\n');
show(me$trans$eqtls)
qqfile=paste(base.dir,"/QQPlot/QQPlot.",cluster,".genome.pdf",sep="")
pdf(qqfile)
plot(me)
dev.off()
The eQTL files are in eQTL_analysis/Cis_eQTLs and eQTL_analysis/Trans_eQTLs