1 Goal

2 Download the data

mkdir data 
cd data
curl -O  https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz
tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz 
cd ..

3 Read Data & Create a Seurat Object

There are 2,700 single cells that were sequenced on the Illumina NextSeq 500.

library(Seurat)
library(dplyr)
library(Matrix)
library(knitr)
library(kableExtra)
library(scater)

pbmc.data <- Read10X(data.dir = "~/Dropbox/SingleCellAnalysis/testanalysis/filtered_gene_bc_matrices/hg19/")
dense.size <- object.size(x = as.matrix(x = pbmc.data))
dense.size
## 709548272 bytes
## Seurat object keeps the data in sparse matrix form
sparse.size <- object.size(x = pbmc.data)
sparse.size
## 38998720 bytes
# Let’s examine the sparse counts matrix
# The columns are indexed by 10x cell barcodes (each 16 nt long), 
# and the rows are the gene names. We mentioned these matrices are sparse, here we see only zeroes (indicated by the “.” symbol); this is the most common value in these sparse matrices. 
pbmc.data[105:110, 1:10] 
## 6 x 10 sparse Matrix of class "dgTMatrix"
##                                  
## RP11-181G12.2 . . . . . . . . . .
## C1orf86       . . . . . . . . 1 .
## AL590822.2    . . . . . . . . . .
## AL590822.1    . . . . . . . . . .
## RP11-181G12.5 . . . . . . . . . .
## RP11-181G12.4 . . . . . . . . . .
# Loading the data #
# Filter 1 :keep genes detected in atleast 3 cells
# Filter 2: Keep Cells atleast 200 genes
# CreateSeurat object imposes basic minimum gene - cutoff

# report number of genes (rows) and number of cells (columns)
dim(pbmc.data) 
## [1] 32738  2700
# Look at the summary counts for genes and cells
counts_per_cell <- Matrix::colSums(pbmc.data) ## Total Counts in each cell(columns)
counts_per_gene <- Matrix::rowSums(pbmc.data)## Total Counts per each gene(rows)
genes_per_cell <- Matrix::colSums(pbmc.data>0) # count gene only if it has non-zero reads mapped.
cells_per_gene <- Matrix::rowSums(pbmc.data>0) # only count cells where the gene is expressed


hist(log10(counts_per_cell+1),main='counts per cell',col='wheat')

hist(log10(genes_per_cell+1), main='genes per cell', col='wheat')

plot(counts_per_cell, genes_per_cell, log='xy', col='brown')
title('counts vs genes per cell')

hist(log10(counts_per_gene+1), main='counts per gene', col='wheat')

# we rank each cell by its library complexity, ie the number of genes detected per cell. 
# This is a very useful plot as it shows the distribution of library complexity in the sequencing run. One can use this plot to investigate observations (potential cells) that are actually failed libraries (lower end outliers) or observations that are cell doublets (higher end outliers).
## Distibution of number of genes per cell
plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)')

# # our analysis will be on the single object, of class Seurat. This object contains various “slots” (designated by seurat@slotname)
# that will store not only the raw count data, but also the results from various computations below. This has the advantage that we do not need to keep track of inidividual variables of interest - they can all be collapsed into a single object as long as these slots are pre-defined.

pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, 
                           project = "10X_PBMC")
pbmc
## An object of class seurat in project 10X_PBMC 
##  13714 genes across 2700 samples.
# seurat@raw.data is a slot that stores the original gene count matrix. 
# We can view the first 10 rows (genes) and the first 10 columns (cells).
pbmc @raw.data[105:110, 1:10] 
## 6 x 10 sparse Matrix of class "dgTMatrix"
##                                  
## SPSB1         . . . . . . . . . .
## SLC25A33      . . . . . . . . . .
## TMEM201       . . . . . . . . . .
## PIK3CD        . . . . . . . . . .
## C1orf200      . . . . . . . . . .
## RP11-558F24.4 . . . . . . . . . .

4 Quality Control : Filter Cells bases on technical parameters

Seurat object initialization step above only considered cells that expressed at least 200 genes. Additionally, we would like to exclude cells that are damaged. A common metric to judge this (although by no means the only one) is the relative expression of mitochondrially derived genes. When the cells apoptose due to stress, their mitochondria becomes leaky and there is widespread RNA degradation. Thus a relative enrichment of mitochondrially derived genes can be a tell-tale sign of cell stress. Here, we compute the proportion of transcripts that are of mitochondrial origin for every cell (percent.mito), and visualize its distribution as a violin plot

# The number of genes and UMIs (nGene and nUMI) are automatically calculated
# for every object by Seurat.We calculate the percentage of
# mitochondrial genes here and store it in percent.mito using AddMetaData.
# We use object@raw.data since this represents non-transformed and
# non-log-normalized counts The % of UMI mapping to MT-genes is a common
# scRNA-seq QC metric.
mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
mito.genes %>% kable() %>% kable_styling()
x
MT-ND1
MT-ND2
MT-CO1
MT-CO2
MT-ATP8
MT-ATP6
MT-CO3
MT-ND3
MT-ND4L
MT-ND4
MT-ND5
MT-ND6
MT-CYB
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
# AddMetaData adds columns to object@meta.data, and is a great place to
# stash QC stats
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
head(pbmc@meta.data)
nrow(pbmc@meta.data)## No. of Cells
## [1] 2700
VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)

# GenePlot is typically used to visualize gene-gene relationships, but can
# be used for anything calculated by the object, i.e. columns in
# object@meta.data, PC scores etc.  Since there is a rare subset of cells
# with an outlier level of high mitochondrial percentage and also low UMI
# content, we filter these as well

par(mfrow = c(1, 2))
GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito")
GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene")

# Load the the list of house keeping genes
hkgenes <- read.table("housekeeping_genes.txt")
hkgenes <- as.vector(hkgenes$V1) %>% kable() %>% kable_styling()


# remove hkgenes that were not found
hkgenes.found <- which(toupper(rownames(pbmc@data)) %in% hkgenes)
n.expressed.hkgenes <- Matrix::colSums(pbmc@data[hkgenes.found, ] > 0)
pbmc <- AddMetaData(object = pbmc, metadata = n.expressed.hkgenes, col.name = "n.exp.hkgenes")
VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito","n.exp.hkgenes"), nCol = 4,  point.size.use = 0.1)
## Warning in SingleVlnPlot(feature = x, data = data.use[, x, drop = FALSE], :
## All cells have the same value of feature.

5 Filter Cells and genes

Let’s filter the cells based on the quality control metrics. Filter based on: 1. nGene 2. percent.mito
Change the thresholds to what you think they should be according to the violin plots

# We filter out cells that have unique gene counts over 2,500 or less than
# 200 Note that low.thresholds and high.thresholds are used to define a
# 'gate'.  -Inf and Inf should be used if you don't want a lower or upper
# threshold.

pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), 
                    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

# Number of cells remaining 
pbmc
## An object of class seurat in project 10X_PBMC 
##  13714 genes across 2638 samples.

6 Normalization

After removing unwanted cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method “LogNormalize” that normalizes the gene expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. This is the simplest and the most intuitive

pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", 
                      scale.factor = 10000)

7 Highly Variable Genes

# Seurat calculates highly variable genes and focuses on these for downstream analysis. 
# FindVariableGenes calculates the average expression and dispersion for each gene, 
# places these genes into bins, and then calculates a z-score for dispersion within each bin
# The parameters here identify ~1900 variable genes

pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, 
                          x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

length(x = pbmc@var.genes)
## [1] 1838

8 Scale Data

# Your single cell dataset likely contains ‘uninteresting’ sources of variation. This could include not only technical noise, 
# but batch effects, or even biological sources of variation (cell cycle stage). 
# As suggested in Buettner et al, NBT, 2015, regressing these signals out of the analysis can 
# improve downstream dimensionality reduction and clustering. To mitigate the effect of these signals,
# Seurat constructs linear models to predict gene expression based on user-defined variables. 
# The scaled z-scored residuals of these models are stored in the scale.data slot, 
# and are used for dimensionality reduction and clustering.
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
## Regressing out: nUMI, percent.mito
## 
## Time Elapsed:  10.4188928604126 secs
## Scaling data matrix

9 PCA on the Scaled Data

# we perform PCA on the scaled data. By default, the genes in object@var.genes are used as input,
# but can be defined using pc.genes.

pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5, 
               genes.print = 5)
## [1] "PC1"
## [1] "CST3"   "TYROBP" "FCN1"   "LST1"   "AIF1"  
## [1] ""
## [1] "PTPRCAP" "IL32"    "LTB"     "CD2"     "CTSW"   
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "NKG7" "GZMB" "PRF1" "CST7" "GZMA"
## [1] ""
## [1] "CD79A"    "MS4A1"    "HLA-DQA1" "TCL1A"    "HLA-DQB1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "PF4"   "PPBP"  "SDPR"  "SPARC" "GNG11"
## [1] ""
## [1] "CYBA"     "HLA-DPA1" "HLA-DPB1" "HLA-DRB1" "CD37"    
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "IL32"   "GIMAP7" "AQP3"   "FYB"    "MAL"   
## [1] ""
## [1] "CD79A"    "HLA-DQA1" "CD79B"    "MS4A1"    "HLA-DQB1"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "FCER1A"  "LGALS2"  "MS4A6A"  "S100A8"  "CLEC10A"
## [1] ""
## [1] "FCGR3A"        "CTD-2006K23.1" "IFITM3"        "ABI3"         
## [5] "CEBPB"        
## [1] ""
## [1] ""
PrintPCA(object = pbmc, pcs.print = 1:5, genes.print = 5, use.full = FALSE)
## [1] "PC1"
## [1] "CST3"   "TYROBP" "FCN1"   "LST1"   "AIF1"  
## [1] ""
## [1] "PTPRCAP" "IL32"    "LTB"     "CD2"     "CTSW"   
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "NKG7" "GZMB" "PRF1" "CST7" "GZMA"
## [1] ""
## [1] "CD79A"    "MS4A1"    "HLA-DQA1" "TCL1A"    "HLA-DQB1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "PF4"   "PPBP"  "SDPR"  "SPARC" "GNG11"
## [1] ""
## [1] "CYBA"     "HLA-DPA1" "HLA-DPB1" "HLA-DRB1" "CD37"    
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "IL32"   "GIMAP7" "AQP3"   "FYB"    "MAL"   
## [1] ""
## [1] "CD79A"    "HLA-DQA1" "CD79B"    "MS4A1"    "HLA-DQB1"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "FCER1A"  "LGALS2"  "MS4A6A"  "S100A8"  "CLEC10A"
## [1] ""
## [1] "FCGR3A"        "CTD-2006K23.1" "IFITM3"        "ABI3"         
## [5] "CEBPB"        
## [1] ""
## [1] ""
VizPCA(object = pbmc, pcs.use = 1:2)

PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)

pbmc <- ProjectPCA(object = pbmc, do.print = FALSE)
# PCHeatmap allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and genes are ordered according to their PCA scores. 
# Setting cells.use to a number plots the ‘extreme’ cells on both ends of the spectrum, 
PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)
## Warning in heatmap.2(data.use, Rowv = NA, Colv = NA, trace = "none", col =
## col.use, : Discrepancy: Rowv is FALSE, while dendrogram is `both'. Omitting
## row dendogram.
## Warning in heatmap.2(data.use, Rowv = NA, Colv = NA, trace = "none", col
## = col.use, : Discrepancy: Colv is FALSE, while dendrogram is `column'.
## Omitting column dendogram.
## Warning in plot.window(...): "dimTitle" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "dimTitle" is not a graphical parameter
## Warning in title(...): "dimTitle" is not a graphical parameter

PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, 
          label.columns = FALSE, use.full = FALSE)

10 Determine statistically significant principal components

# Determining how many PCs to include downstream is therefore an important step.
# We identify ‘significant’ PCs as those who have a strong enrichment of low p-value genes.
pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
# ‘Significant’ PCs will show a strong enrichment of genes with low p-values
# (solid curve above the dashed line). In this case it appears that PCs 1-10 are significant.


JackStrawPlot(object = pbmc, PCs = 1:12)
## Warning: Removed 17469 rows containing missing values (geom_point).

## An object of class seurat in project 10X_PBMC 
##  13714 genes across 2638 samples.

11 Cluster the cells

# Importantly, the distance metric which drives the clustering analysis
# (based on previously identified PCs) remains the same.
# The FindClusters function implements the procedure, and contains a resolution parameter
# that sets the ‘granularity’ of the downstream clustering, with increased values leading
# to a greater number of clusters. We find that setting this parameter between
# 0.6-1.2 typically returns good results for single cell datasets of around 3K cells.
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, 
                     resolution = 0.6, print.output = 0, save.SNN = TRUE)
PrintFindClustersParams(object = pbmc)
## Parameters used in latest FindClusters calculation run on: 2019-07-30 22:49:38
## =============================================================================
## Resolution: 0.6
## -----------------------------------------------------------------------------
## Modularity Function    Algorithm         n.start         n.iter
##      1                   1                 100             10
## -----------------------------------------------------------------------------
## Reduction used          k.param          prune.SNN
##      pca                 30                0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
# Non-linear dimensional reduction (tSNE)

pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
TSNEPlot(object = pbmc)

12 Finding differentially expressed genes (cluster biomarkers)

# Seurat can help you find markers that define clusters via differential expression. 
# By default, it identifes positive and negative markers of a single cluster 
# (specified in ident.1), compared to all other cells. FindAllMarkers automates this 
# process for all clusters, but you can also test groups of clusters vs. each other,
# or against all cells.

# The min.pct argument requires a gene to be detected at a minimum percentage in 
# either of the two groups of cells, and the thresh.test argument requires a gene to be 
# differentially expressed (on average) by some amount between the two groups.
# find all markers of cluster 1
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
print(x = head(x = cluster1.markers, n = 5)) %>% kable() %>% kable_styling()
##               p_val avg_logFC pct.1 pct.2     p_val_adj
## S100A9  0.00000e+00  3.830954 0.996 0.216  0.000000e+00
## S100A8  0.00000e+00  3.786963 0.973 0.122  0.000000e+00
## LGALS2  0.00000e+00  2.639056 0.908 0.060  0.000000e+00
## FCN1    0.00000e+00  2.367232 0.954 0.151  0.000000e+00
## CD14   5.05283e-289  1.947447 0.662 0.029 6.929451e-285
p_val avg_logFC pct.1 pct.2 p_val_adj
S100A9 0 3.830954 0.996 0.216 0
S100A8 0 3.786963 0.973 0.122 0
LGALS2 0 2.639056 0.908 0.060 0
FCN1 0 2.367232 0.954 0.151 0
CD14 0 1.947447 0.662 0.029 0
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5, ident.2 = c(0, 3), 
                                min.pct = 0.25)
print(x = head(x = cluster5.markers, n = 5)) %>% kable() %>% kable_styling()
##                p_val avg_logFC pct.1 pct.2     p_val_adj
## GZMB   2.280513e-190  3.195731 0.955 0.084 3.127496e-186
## IGFBP7 1.859718e-155  2.175973 0.542 0.010 2.550417e-151
## GNLY   5.014639e-155  3.515681 0.961 0.143 6.877075e-151
## FGFBP2 5.082837e-150  2.558487 0.852 0.086 6.970603e-146
## SPON2  4.814899e-141  2.191047 0.716 0.053 6.603153e-137
p_val avg_logFC pct.1 pct.2 p_val_adj
GZMB 0 3.195731 0.955 0.084 0
IGFBP7 0 2.175973 0.542 0.010 0
GNLY 0 3.515681 0.961 0.143 0
FGFBP2 0 2.558487 0.852 0.086 0
SPON2 0 2.191047 0.716 0.053 0
# find markers for every cluster compared to all remaining cells, report
# only the positive ones

pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, 
                               thresh.use = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
# VlnPlot (shows expression probability distributions across clusters)

VlnPlot(object = pbmc, features.plot = c("MS4A1", "CD79A"))

VlnPlot(object = pbmc, features.plot = c("LGALS2"))

VlnPlot(object = pbmc, features.plot = c("GZMB"))

# FeaturePlot (visualizes gene expression on a tSNE or PCA plot) 
FeaturePlot(object = pbmc, features.plot = c("MS4A1", "GNLY", "CD3E", "CD14", 
                                             "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"), cols.use = c("grey", "blue"), 
            reduction.use = "tsne")

# DoHeatmap generates an expression heatmap for given cells and genes. In this case, 
# we are plotting the top 10 markers (or all markers if less than 10) for each cluster.
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
top10%>% kable() %>% kable_styling()
p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
0 1.1488046 0.924 0.482 0 0 LDHB
0 1.0446491 0.870 0.249 0 0 CD3D
0 0.9718669 0.769 0.266 0 0 CD3E
0 0.8951618 0.932 0.527 0 0 LTB
0 1.0676981 0.662 0.202 0 0 IL7R
0 0.8957091 0.648 0.260 0 0 NOSIP
0 0.9062039 0.369 0.063 0 0 CCR7
0 0.8529238 0.318 0.047 0 0 LEF1
0 0.9090779 0.272 0.027 0 0 MAL
0 0.8898080 0.315 0.053 0 0 AQP3
0 3.8309543 0.996 0.216 0 1 S100A9
0 3.7869632 0.973 0.122 0 1 S100A8
0 2.6390560 0.908 0.060 0 1 LGALS2
0 2.3672324 0.954 0.151 0 1 FCN1
0 1.9474468 0.662 0.029 0 1 CD14
0 2.1113572 0.994 0.265 0 1 TYROBP
0 1.9974617 0.685 0.042 0 1 MS4A6A
0 2.0646875 0.992 0.266 0 1 CST3
0 3.1209043 1.000 0.517 0 1 LYZ
0 1.8080101 1.000 0.987 0 1 FTL
0 2.9778519 0.936 0.042 0 2 CD79A
0 2.3376466 0.860 0.053 0 2 MS4A1
0 2.4006436 0.915 0.142 0 2 CD79B
0 1.9609087 0.564 0.010 0 2 LINC00926
0 2.4860187 0.623 0.022 0 2 TCL1A
0 2.1266067 0.895 0.118 0 2 HLA-DQA1
0 1.6778656 0.491 0.007 0 2 VPREB3
0 2.1392801 0.868 0.147 0 2 HLA-DQB1
0 2.0279420 1.000 0.821 0 2 CD74
0 1.9183262 1.000 0.495 0 2 HLA-DRA
0 2.1588123 0.974 0.230 0 3 CCL5
0 1.5777208 0.955 0.212 0 3 NKG7
0 2.1134275 0.588 0.050 0 3 GZMK
0 1.4472338 0.773 0.120 0 3 CST7
0 1.4012978 0.782 0.120 0 3 GZMA
0 1.3492329 0.503 0.067 0 3 CD8A
0 1.2989566 0.821 0.230 0 3 CTSW
0 1.2782477 0.484 0.066 0 3 KLRG1
0 1.1214810 0.565 0.132 0 3 LYAR
0 1.8373033 0.403 0.060 0 3 GZMH
0 1.6394699 0.822 0.071 0 4 RP11-290F20.3
0 1.7844033 0.803 0.075 0 4 MS4A7
0 2.2786520 0.962 0.137 0 4 FCGR3A
0 1.8708592 0.975 0.179 0 4 IFITM3
0 2.1541191 1.000 0.316 0 4 LST1
0 1.5856303 0.854 0.146 0 4 RHOC
0 1.5977308 0.949 0.202 0 4 SERPINA1
0 1.9140734 1.000 0.318 0 4 FCER1G
0 1.8445657 1.000 0.335 0 4 AIF1
0 1.5883947 0.917 0.292 0 4 TIMP1
0 3.3346342 0.955 0.068 0 5 GZMB
0 2.7625225 0.852 0.064 0 5 FGFBP2
0 2.2657977 0.716 0.040 0 5 SPON2
0 2.7251334 0.935 0.108 0 5 PRF1
0 3.7639282 0.961 0.131 0 5 GNLY
0 2.1971642 0.935 0.151 0 5 GZMA
0 2.2330868 0.942 0.150 0 5 CST7
0 2.7735014 1.000 0.255 0 5 NKG7
0 2.5130649 0.703 0.077 0 5 CCL4
0 2.0255379 0.987 0.257 0 5 CTSW
0 2.7292430 0.844 0.011 0 6 FCER1A
0 1.8267447 0.750 0.015 0 6 CLEC10A
0 1.8925708 0.969 0.209 0 6 HLA-DQA1
0 1.6861465 0.938 0.232 0 6 HLA-DQB1
0 1.8398150 1.000 0.487 0 6 HLA-DPA1
0 1.6533064 1.000 0.343 0 6 HLA-DRB5
0 1.7637805 1.000 0.391 0 6 CST3
0 1.8089142 1.000 0.453 0 6 HLA-DRB1
0 1.9651684 1.000 0.513 0 6 HLA-DPB1
0 1.7345520 1.000 0.555 0 6 HLA-DRA
0 3.5492219 0.929 0.001 0 7 GP9
0 3.5115581 0.929 0.008 0 7 SPARC
0 5.0207262 1.000 0.010 0 7 PF4
0 4.5430618 0.929 0.010 0 7 GNG11
0 4.3043974 0.929 0.011 0 7 SDPR
0 4.2275982 0.857 0.015 0 7 CLU
0 5.9443347 1.000 0.024 0 7 PPBP
0 4.0226305 0.786 0.015 0 7 TUBB1
0 3.9637176 0.857 0.025 0 7 CD9
0 3.9215151 0.929 0.031 0 7 HIST1H2AC
DoHeatmap(object = pbmc, genes.use = top10$gene, slim.col.label = TRUE, remove.key = TRUE)

13 Assigning cell type identity to clusters

current.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7)
new.cluster.ids <- c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", 
                     "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 0.5)

14 Converting to/from SingleCellExperiment

# download from satija lab
# https://www.dropbox.com/s/kwd3kcxkmpzqg6w/pbmc3k_final.rds?dl=0
pbmc <- readRDS("~/Downloads/pbmc3k_final.rds")
pbmc_sce <- Convert(from = pbmc, to = "sce")
p1 <- plotExpression(object = pbmc_sce, features = "MS4A1", x = "ident") + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1))
p2 <- plotPCA(object = pbmc_sce, colour_by = "ident")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
plot_grid(p1, p2)