Now you have completed the transcript-level quantification using Salmon. The next step is to use txiimport to aggregate the data to gene-level for downstream analysis, e.g. differential expression analysis by DESeq2.
You may need to install some of the packages.
# setwd("~/Documents/RNAseq_Bootcamp")
# getwd()
library("devtools")
## Loading required package: usethis
library("BiocStyle")
library("knitr")
library("rmarkdown")
##
## Attaching package: 'rmarkdown'
## The following objects are masked from 'package:BiocStyle':
##
## html_document, md_document, pdf_document
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
library("pheatmap")
library("RColorBrewer")
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("tximport")
# To view documentation
browseVignettes("tximport")
## starting httpd help server ... done
# Load txiimport
library(tximport)
# Create a csv file containing the sample information and locate the directory containing the file
dir = "/Users/hanruizhang/Bioinformatics/Zhang_RNAseq_Bootcamp_20190710"
list.files(dir)
## [1] "gencode.v30_salmon_0.14.0" "gencode.v30.annotation.gtf.gz"
## [3] "gencode.v30.annotation.sqlite" "gencode.v30.transcripts.fa.gz"
## [5] "GSE55536_DESeq2.csv" "GSE55536_fastq"
## [7] "GSE55536_quants" "GSE55536_sample.csv"
## [9] "GSE55536.R" "multiqc_data"
## [11] "multiqc_report.html" "RNAseq_analysis_workflow.md"
## [13] "RNAseq_Bootcamp_GSE55536.html" "RNAseq_Bootcamp_GSE55536.nb.html"
## [15] "RNAseq_Bootcamp_GSE55536.Rmd"
# Read the file and check everything looks correct
coldata <- read.csv("GSE55536_sample.csv", header = TRUE)
head(coldata)
list.files(file.path(dir,"GSE55536_quants"))
## [1] "M0_HMDM_1" "M0_HMDM_2" "M0_HMDM_3" "M1_HMDM_1" "M1_HMDM_2" "M1_HMDM_3"
colnames(coldata)
## [1] "X" "id" "subject" "treatment"
idx <- c("id","subject","treatment")
coldata[,idx]
levels(coldata$id)
## [1] "M0_HMDM_1" "M0_HMDM_2" "M0_HMDM_3" "M1_HMDM_1" "M1_HMDM_2" "M1_HMDM_3"
coldata$subject <- as.factor(coldata$subject) # change numeric number of subject id to factor
levels(coldata$subject)
## [1] "1" "2" "3"
levels(coldata$treatment)
## [1] "M0" "M1"
files <- file.path(dir,"GSE55536_quants",coldata$id,"quant.sf")
names(files) <- coldata$id
all(file.exists(files)) # the output should be "TRUE"
## [1] TRUE
files
## M0_HMDM_1
## "/Users/hanruizhang/Bioinformatics/Zhang_RNAseq_Bootcamp_20190710/GSE55536_quants/M0_HMDM_1/quant.sf"
## M0_HMDM_2
## "/Users/hanruizhang/Bioinformatics/Zhang_RNAseq_Bootcamp_20190710/GSE55536_quants/M0_HMDM_2/quant.sf"
## M0_HMDM_3
## "/Users/hanruizhang/Bioinformatics/Zhang_RNAseq_Bootcamp_20190710/GSE55536_quants/M0_HMDM_3/quant.sf"
## M1_HMDM_1
## "/Users/hanruizhang/Bioinformatics/Zhang_RNAseq_Bootcamp_20190710/GSE55536_quants/M1_HMDM_1/quant.sf"
## M1_HMDM_2
## "/Users/hanruizhang/Bioinformatics/Zhang_RNAseq_Bootcamp_20190710/GSE55536_quants/M1_HMDM_2/quant.sf"
## M1_HMDM_3
## "/Users/hanruizhang/Bioinformatics/Zhang_RNAseq_Bootcamp_20190710/GSE55536_quants/M1_HMDM_3/quant.sf"
This data.frame is required because transcripts IDs in salmon need to be associated with gene IDs for gene-level summarization
## Make TxDb - saveDb to make it quicker next time
# BiocManager::install("rtracklayer")
# BiocManager::install("GenomicFeatures")
library(rtracklayer)
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
library(GenomicFeatures)
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
gtf <- "gencode.v30.annotation.gtf.gz"
txdb.filename <- "gencode.v30.annotation.sqlite"
txdb <- makeTxDbFromGFF(gtf) # This step takes some time.
## Import genomic features from the file as a GRanges object ...
## OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of
## type stop_codon. This information was ignored.
## OK
saveDb(txdb, txdb.filename)
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: gencode.v30.annotation.gtf.gz
## # Organism: NA
## # Taxonomy ID: NA
## # miRBase build ID: NA
## # Genome: NA
## # transcript_nrow: 208621
## # exon_nrow: 710152
## # cds_nrow: 274588
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2019-10-15 15:05:12 -0400 (Tue, 15 Oct 2019)
## # GenomicFeatures version at creation time: 1.36.4
## # RSQLite version at creation time: 2.1.2
## # DBSCHEMAVERSION: 1.2
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- select(txdb, k, "GENEID", "TXNAME")
## 'select()' returned 1:1 mapping between keys and columns
head(tx2gene)
library("tximport")
library("jsonlite")
library("readr") # the "readr" makes tximport quicker
txi <- tximport(files, type="salmon", tx2gene=tx2gene)
## reading in files with read_tsv
## 1 2 3 4 5 6
## summarizing abundance
## summarizing counts
## summarizing length
names(txi)
## [1] "abundance" "counts" "length"
## [4] "countsFromAbundance"
txi$counts[1:3,1:3]
## M0_HMDM_1 M0_HMDM_2 M0_HMDM_3
## ENSG00000000003.14 0 0 0
## ENSG00000000005.6 0 0 0
## ENSG00000000419.12 47 31 20
txi$length[1:3,1:3]
## M0_HMDM_1 M0_HMDM_2 M0_HMDM_3
## ENSG00000000003.14 3662.2410 3662.241 3662.241
## ENSG00000000005.6 684.7500 684.750 684.750
## ENSG00000000419.12 878.6459 886.474 850.887
txi$abundance[1:3,1:3]
## M0_HMDM_1 M0_HMDM_2 M0_HMDM_3
## ENSG00000000003.14 0.00000 0.00000 0.00000
## ENSG00000000005.6 0.00000 0.00000 0.00000
## ENSG00000000419.12 79.98988 49.94774 36.35912
txi$countsFromAbundance
## [1] "no"
# Construct a DESeqDataSet (dds) using txi
library("DESeq2")
## Loading required package: SummarizedExperiment
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
dds <- DESeqDataSetFromTximport(txi, coldata, design = ~subject + treatment) # dds is now ready for DESeq() see DESeq2 vignette
## using counts and average transcript lengths from tximport
genetable <- data.frame(gene.id = rownames(txi$counts))
names(assays(dds))
## [1] "counts" "avgTxLength"
# EdgeR
library("edgeR")
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
cts <- txi$counts
normMat <- txi$length
normMat <- normMat/exp(rowMeans(log(normMat)))
o <- log(calcNormFactors(cts/normMat)) + log(colSums(cts/normMat))
dge <- DGEList(counts = cts,
samples = coldata,
genes = genetable)
dge <- scaleOffset(dge, t(t(log(normMat)) + o))
names(dge)
## [1] "counts" "samples" "genes" "offset"
# dds
nrow(dds)
## [1] 58434
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
## [1] 15940
# dge
dge <- dge[rowSums(round(dge$counts)) > 1, ]
all(rownames(dge) == rownames(dds))
## [1] TRUE
dge <- dge[filterByExpr(dge),]
library("vsn")
# Transformation with vst (for n>30)
vsd <- vst(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(vsd), 3)
## M0_HMDM_1 M0_HMDM_2 M0_HMDM_3 M1_HMDM_1 M1_HMDM_2
## ENSG00000000419.12 6.209145 5.867141 5.608826 5.794110 5.476258
## ENSG00000000457.14 4.923380 4.878699 4.840362 6.080892 5.902086
## ENSG00000000460.17 5.350176 5.474443 5.323995 4.937214 4.919341
## M1_HMDM_3
## ENSG00000000419.12 5.729253
## ENSG00000000457.14 6.215809
## ENSG00000000460.17 4.776723
meanSdPlot(assay(vsd), ranks = FALSE)
# Transformation with rlog (for n<30)
rld <- rlog(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(rld), 3)
## M0_HMDM_1 M0_HMDM_2 M0_HMDM_3 M1_HMDM_1 M1_HMDM_2
## ENSG00000000419.12 5.112337 4.692518 4.348300 4.597367 4.165171
## ENSG00000000457.14 2.953807 2.820133 2.810344 4.652513 4.421047
## ENSG00000000460.17 3.231867 3.398876 3.194962 2.672349 2.646414
## M1_HMDM_3
## ENSG00000000419.12 4.512478
## ENSG00000000457.14 4.805362
## ENSG00000000460.17 2.487057
meanSdPlot(assay(rld), ranks = FALSE)
# Plot sample distance
# Plot sample-to-sample distances using the rlog-transformed values
sampleDists <- dist(t(assay(rld)))
sampleDists
## M0_HMDM_1 M0_HMDM_2 M0_HMDM_3 M1_HMDM_1 M1_HMDM_2
## M0_HMDM_2 54.78074
## M0_HMDM_3 59.58941 75.12378
## M1_HMDM_1 111.37075 110.33474 122.91857
## M1_HMDM_2 120.02160 116.20741 129.00920 61.14667
## M1_HMDM_3 112.29956 112.37634 116.69185 61.94952 60.95431
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$treatment, rld$subject, sep = " - ")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
# Plot sample-to-sample distances using the Poisson distance
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix(poisd$dd)
rownames(samplePoisDistMatrix) <- paste(vsd$treatment, vsd$subject, sep = " - ")
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
clustering_distance_rows = poisd$dd,
clustering_distance_cols = poisd$dd,
col = colors)
# PCA plot
### PCA plot using DESeq2
plotPCA(vsd, intgroup = c("treatment", "subject"))
### PCA plot using qqplot
pcaData <- plotPCA(vsd, intgroup = c("treatment", "subject"), returnData = TRUE)
pcaData
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = treatment, shape = subject)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed()
# MDS plot
# MDS plot from the VST data
mds <- as.data.frame(colData(vsd)) %>%
cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = treatment, shape = subject)) +
geom_point(size = 3) + coord_fixed()
# MDS plot with poisson distribution
mdsPois <- as.data.frame(colData(dds)) %>%
cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = treatment, shape = subject)) +
geom_point(size = 3) + coord_fixed()
# Differential expression (DE) analysis
dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
head(res)
## log2 fold change (MLE): treatment M1 vs M0
## Wald test p-value: treatment M1 vs M0
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000000419.12 24.7985738722766 -0.457154423198636 0.637998030608425
## ENSG00000000457.14 19.5760629521881 3.1207087694737 0.749272065373874
## ENSG00000000460.17 8.39290563946875 -1.69466913322939 1.01901475884969
## ENSG00000000938.13 116.053409790579 -1.5143736980187 0.4326122525949
## ENSG00000000971.15 35.9943841917436 4.01145105983981 1.70662421990445
## ENSG00000001036.13 115.962664360505 -0.0919167365618874 0.380351171423933
## stat pvalue
## <numeric> <numeric>
## ENSG00000000419.12 -0.716545194916467 0.473654771291156
## ENSG00000000457.14 4.16498747743453 3.11369654741656e-05
## ENSG00000000460.17 -1.66304670124936 0.0963030875514567
## ENSG00000000938.13 -3.50053353536606 0.000464327812874363
## ENSG00000000971.15 2.35051806546165 0.0187472975925626
## ENSG00000001036.13 -0.241662819698374 0.809041441297499
## padj
## <numeric>
## ENSG00000000419.12 0.714941486997797
## ENSG00000000457.14 0.000391499267830781
## ENSG00000000460.17 0.267680171310452
## ENSG00000000938.13 0.00397072083277613
## ENSG00000000971.15 0.0817347395918228
## ENSG00000001036.13 0.914251343483738
res <- results(dds, contrast = c("treatment", "M0", "M1"))
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized counts for all samples
## log2FoldChange results log2 fold change (MLE): treatment M0 vs M1
## lfcSE results standard error: treatment M0 vs M1
## stat results Wald statistic: treatment M0 vs M1
## pvalue results Wald test p-value: treatment M0 vs M1
## padj results BH adjusted p-values
summary(res)
##
## out of 15940 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1248, 7.8%
## LFC < 0 (down) : 1594, 10%
## outliers [1] : 0, 0%
## low counts [2] : 4327, 27%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
##
## FALSE TRUE
## 9303 2310
resLFC1 <- results(dds, lfcThreshold = 1)
table(resLFC1$padj < 0.05)
##
## FALSE TRUE
## 10615 689
dim(resLFC1)
## [1] 15940 6
sum(res$pvalue < 0.05, na.rm=TRUE)
## [1] 3472
sum(!is.na(res$pvalue))
## [1] 15940
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 2842
resSig <- subset(res, padj < 0.1)
head(resSig[ order(resSig$log2FoldChange), ])
## log2 fold change (MLE): treatment M0 vs M1
## Wald test p-value: treatment M0 vs M1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000128271.22 129.11167807625 -10.5331563244342 1.24692381029415
## ENSG00000172724.12 429.584688891353 -10.4705674716919 1.39605472067509
## ENSG00000110436.13 97.804574961149 -10.2116187648473 1.25426808759635
## ENSG00000166016.6 99.0987984959861 -10.1661467564108 1.25452298017705
## ENSG00000126353.3 824.105004786337 -10.0381053242188 0.776846924798231
## ENSG00000197272.2 82.6671087129431 -9.89129103393588 1.26131000756679
## stat pvalue
## <numeric> <numeric>
## ENSG00000128271.22 -8.44731349058884 2.98082527814157e-17
## ENSG00000172724.12 -7.50011250750163 6.37630806976776e-14
## ENSG00000110436.13 -8.14149611700365 3.90423063987907e-16
## ENSG00000166016.6 -8.10359548373997 5.33583134435151e-16
## ENSG00000126353.3 -12.9216001296857 3.39996398476153e-38
## ENSG00000197272.2 -7.84207766100048 4.43152136561998e-15
## padj
## <numeric>
## ENSG00000128271.22 3.06339150044762e-15
## ENSG00000172724.12 4.02435139207679e-12
## ENSG00000110436.13 3.46105575732181e-14
## ENSG00000166016.6 4.69431889408743e-14
## ENSG00000126353.3 2.46773635968973e-35
## ENSG00000197272.2 3.38574063282532e-13
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): treatment M0 vs M1
## Wald test p-value: treatment M0 vs M1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000178726.6 76.4924616404605 9.52767595564549 1.26336222011302
## ENSG00000124491.15 182.177927333263 9.52416292440525 1.16823634432668
## ENSG00000153012.12 38.9173565842749 8.20528058937533 1.31018661405471
## ENSG00000171659.15 62.7960740661657 8.06575790043976 1.26579636270717
## ENSG00000153823.18 25.6898689408037 7.8593491553705 1.34609217098699
## ENSG00000282608.1 23.4213886009629 7.811551598063 1.35621993935391
## stat pvalue
## <numeric> <numeric>
## ENSG00000178726.6 7.541523566213 4.64512106585242e-14
## ENSG00000124491.15 8.15259940392849 3.5618395904094e-16
## ENSG00000153012.12 6.26268082833025 3.78414655890105e-10
## ENSG00000171659.15 6.3720817487494 1.86479435068881e-10
## ENSG00000153823.18 5.83864116051417 5.26282822418129e-09
## ENSG00000282608.1 5.75979704426435 8.42151338649086e-09
## padj
## <numeric>
## ENSG00000178726.6 2.98031994131183e-12
## ENSG00000124491.15 3.1818187048788e-14
## ENSG00000153012.12 1.33167557540963e-08
## ENSG00000171659.15 6.96329800467818e-09
## ENSG00000153823.18 1.51655643095328e-07
## ENSG00000282608.1 2.36229553037001e-07
topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup=c("treatment"))
library("ggbeeswarm")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("treatment","subject"),
returnData = TRUE)
ggplot(geneCounts, aes(x = treatment, y = count, color = subject)) +
scale_y_log10() + geom_beeswarm(cex = 3)
# Clustering by the top variable genes
library("genefilter")
##
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
##
## rowSds, rowVars
## The following object is masked from 'package:readr':
##
## spec
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)
mat <- assay(vsd)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("subject","treatment")])
pheatmap(mat, annotation_col = anno)
# Annotating and exporting results
# Ordered the results
resOrdered <- res[order(res$pvalue),]
head(resOrdered)
## log2 fold change (MLE): treatment M0 vs M1
## Wald test p-value: treatment M0 vs M1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000131203.13 2021.57176312256 -9.00664225967249 0.467407974560547
## ENSG00000026751.17 2512.29357163371 -5.18496805783348 0.295683583817159
## ENSG00000123610.5 1207.20705225123 -8.03728284852897 0.481444784946851
## ENSG00000122643.20 416.278336842683 -6.37751165830618 0.411694633274812
## ENSG00000023445.14 580.430347287543 -6.16926419557139 0.400417864109125
## ENSG00000120217.14 492.197957356586 -5.4793351344632 0.359844451315732
## stat pvalue
## <numeric> <numeric>
## ENSG00000131203.13 -19.269338029888 9.71709680949578e-83
## ENSG00000026751.17 -17.5355289965631 7.67322008581569e-69
## ENSG00000123610.5 -16.6940905786657 1.44707306336257e-62
## ENSG00000122643.20 -15.4908787796831 3.99788277372632e-54
## ENSG00000023445.14 -15.4070653398473 1.46725303036322e-53
## ENSG00000120217.14 -15.2269546311708 2.3425151995977e-52
## padj
## <numeric>
## ENSG00000131203.13 1.12844645248674e-78
## ENSG00000026751.17 4.45545524282888e-65
## ENSG00000123610.5 5.60161982827651e-59
## ENSG00000122643.20 1.16068531628209e-50
## ENSG00000023445.14 3.40784188832162e-50
## ENSG00000120217.14 4.53393816882135e-49
# Write csv
write.csv(res, "GSE55536_DESeq2.csv")