# R Options
options(stringsAsFactors=FALSE,
        dplyr.summarise.inform=FALSE, 
        knitr.table.format="html",
        kableExtra_view_html=TRUE,
        future.globals.maxSize=2000000000, mc.cores=1, 
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

# Required libraries
library(Seurat)
library(celldex)
library(clustifyrdatahub)
library(SingleR)
library(clustifyr)
library(pheatmap)
library(patchwork)
library(magrittr)
library(ggplot2)

# Knitr default options
knitr::opts_chunk$set(echo=TRUE,                     # output code
                      cache=FALSE,                   # do not cache results
                      message=TRUE,                  # show messages
                      warning=TRUE,                  # show warnings
                      tidy=FALSE,                    # do not auto-tidy-up code
                      fig.width=10,                  # default fig width in inches
                      class.source='fold-hide',      # by default collapse code blocks
                      dev=c('png', 'pdf'),           # create figures in png and pdf; the first device (png) will be used for HTML output
                      dev.args=list(png=list(type="cairo"),  # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
                                    pdf=list(bg="white")),     # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
                      dpi=96,                        # figure resolution
                      fig.retina=2                   # retina multiplier
)
# Load Seurat S4 object; output of scrnaseq script https://github.com/ktrns/scrnaseq
load("/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/scrnaseq.RData")


# Load reference datasets

# Reference dataset obtained from celldex for singleR
# https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html
# Paste reference dataset name
ref = HumanPrimaryCellAtlasData()
ref_name = "HumanPrimaryCellAtlasData"

# Reference from clustifyrdatahub
# List https://rnabioco.github.io/clustifyrdata/articles/download_refs.html
# Paste reference dataset name
ref2 = clustifyrdatahub::ref_hema_microarray()
ref2_name = "ref_hema_microarray"

Dataset description

Single cell transcriptomes can be difficult to annotate without extensive knowledge of the underlying biology. Given a reference dataset (of samples from single-cell or bulk RNA sequencing) with known labels, it is possible to assign labels to the cells from a test dataset based on similarity to that reference. Hence, the biological knowledge (defined marker genes and cluster identities) can be propagated from the reference dataset to the test dataset in an automated manner and aid in cluster identification.

Here, we performed cell type annotation for the samples sample1, sample2 of the project pbmc_small (from publicly available single cell RNA-seq data) with reference datasets obtained from databases. We use the S4 class object generated in the Main-Report “Single-cell RNA-seq data analysis” and convert it to a SingleCellExperiment class object. We use two different Tools (SingleR and Clustifyr) and reference datasets for the cluster annotation.

# Convert to SingleCellExperiment object
sce = as.SingleCellExperiment(sc)

# Add gene_short_name to cds object
rowData(sce)$gene_short_name = rownames(sce)

sc
## An object of class Seurat 
## 2092 features across 416 samples within 1 assay 
## Active assay: RNA (2092 features, 2092 variable features)
##  2 dimensional reductions calculated: pca, umap
sce
## class: SingleCellExperiment 
## dim: 2092 416 
## metadata(0):
## assays(3): counts logcounts scaledata
## rownames(2092): FAM41C B3GALT6 ... MT-CO2 MT-ND4L
## rowData names(1): gene_short_name
## colnames(416): GCTTCACCAACACGAG-1 TAGTGCACAGAGGAAA-1 ...
##   TTCCTTCAGGTAGTCG-2 TCGATTTGTACGCTTA-2
## colData names(16): orig.ident nCount_RNA ... RNA_snn_res.0.8 ident
## reducedDimNames(2): PCA UMAP
## mainExpName: RNA
## altExpNames(0):



Annotation with SingleR

Cells or clusters were annotated with reference dataset HumanPrimaryCellAtlasData obtained from celldex (https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html). The celldex package provides access to several reference datasets (mostly derived from bulk RNA-seq or microarray data).

# Annotate cells and clusters using SingleR with reference dataset form celldex
sce_ann_cells = SingleR(test = sce, ref = ref, assay.type.test = 1, labels = ref$label.fine)
sce_ann_clusters = SingleR(test = sce, ref = ref, assay.type.test = 1, labels = ref$label.fine, clusters = sce$seurat_clusters)

annotated_cells = table(sce_ann_cells$labels)
annotated_clusters = table(sce_ann_clusters$labels)

sc[["SingleR.labels"]] = sce_ann_cells$labels
sc[["SingleR.cluster.labels"]] = sce_ann_clusters$labels[match(sc[[]][["seurat_clusters"]], rownames(sce_ann_clusters))]

Annotation of single cells

UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cell. The annotation of each cell is more sensitive, but also more prone to artefacts compared to the annotation of clusters as performed in later steps. Here, we perform annotation of single cells for annotation diagnostics, that means for assessment of the reliability of cell type annotation and how close all cells resemble the cell types of the reference dataset.

# Visualization of singleR annotation of cells
p1 = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) + 
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2") +
  theme(title = element_text(size = 10)) +
  NoGrid() + 
  NoLegend()
p1 = LabelClusters(p1, id="seurat_clusters")

p2 = Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.labels", pt.size=param$pt_size) + 
  AddStyle(title="Cell types \n(annotation per cell)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p = p1 + p2
p

knitr::kable(annotated_cells, align="l", caption="Cell types assigned to cells") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) 
Cell types assigned to cells
Var1 Freq
B_cell 3
B_cell:immature 24
B_cell:Memory 8
B_cell:Naive 35
B_cell:Plasma_cell 3
CMP 1
GMP 1
Macrophage:monocyte-derived:M-CSF 1
Monocyte 6
Monocyte:CD14+ 14
Monocyte:CD16- 99
Monocyte:CD16+ 22
NK_cell 15
NK_cell:IL2 4
Pre-B_cell_CD34- 1
T_cell:CD4+_central_memory 76
T_cell:CD4+_effector_memory 24
T_cell:CD4+_Naive 54
T_cell:CD8+ 23
T_cell:CD8+_Central_memory 1
T_cell:gamma-delta 1


Annotation diagnostics

Heatmap displays the scores for all cells across all reference labels, allowing to assess the confidence of the corresponding predicted labels. Ideally, each cell should have one score that is distinctively higher than the rest, indicating that an unambiguous assignment.

# Annotation diagnostics
# https://bioconductor.org/packages/devel/bioc/vignettes/SingleR/inst/doc/SingleR.html

p = plotScoreHeatmap(sce_ann_cells)
p



Plot displaying per-cell “deltas” (the difference between the score for the assigned label and the median across all labels). Low deltas indicate that the assignment is uncertain. The minimum threshold on the deltas is defined using an outlier-based approach. Yellow marked points represents outliers that fell below the threshold.

# Annotation diagnostics
p = plotDeltaDistribution(sce_ann_cells, ncol = 3)
p

number_pruned_table = table(is.na(sce_ann_cells$pruned.labels))
number_pruned_table[3]=round((number_pruned_table[1]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
number_pruned_table[4]=round((number_pruned_table[2]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
names(number_pruned_table) = c("assigned", "ambiguous", "% assigned", "% ambiguous")

number_pruned_table = (t(number_pruned_table))
knitr::kable(number_pruned_table, align="l", caption="Number of annotated cells") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) 
Number of annotated cells
assigned ambiguous % assigned % ambiguous
412 4 99.04 0.96


Annotation of clusters

UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cluster as unit.

# Visualization of singleR annotation of clusters
p1 = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) + 
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2") +
  theme(title = element_text(size = 10)) +
  NoGrid() + 
  NoLegend()
p1 = LabelClusters(p1, id="seurat_clusters")

p2 = Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.cluster.labels", pt.size=param$pt_size) + 
  AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p = p1 + p2
p

knitr::kable(annotated_clusters, align="l", caption="Cell types assigned to clusters") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Cell types assigned to clusters
Var1 Freq
B_cell:immature 1
Monocyte:CD16- 1
T_cell:CD4+_central_memory 1
T_cell:CD8+ 1
# Visualization of singleR annotation of clusters separately per sample
p = Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.cluster.labels", split.by = "orig.ident", pt.size=param$pt_size) + 
  AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()
p




Annotation with Clustifyr

Cells or clusters were annotated with reference dataset ref_hema_microarray obtained from clustifyrdatahub (https://rnabioco.github.io/clustifyrdata/articles/download_refs.html). Clustifyrdata provides 42 external data sets for cell-type assignment with Clustifyr. ## Annotation of clusters UMAPs display cells colored by Seurat clusters and cell types annotated by Clustifyr. Annotation was performed for each cluster as unit.

# Annotate clusters using Clustifyr with reference dataset form clustifyrdatahub
sc = clustify(sc, ref_mat = ref2, cluster_col = "seurat_clusters", query_genes = VariableFeatures(sc))

# Visualization of Clustifyr annotation of clusters
p1 = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) + 
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2") +
  theme(title = element_text(size = 10)) +
  NoGrid() + 
  NoLegend()
p1 = LabelClusters(p1, id="seurat_clusters")

p2 = Seurat::DimPlot(sc, reduction="umap", group.by="type", pt.size=param$pt_size) + 
  AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()
p = p1 + p2
p

annotated_clusters = table(sc$type)
knitr::kable(annotated_clusters, align="l", caption="Cell types assigned to cells") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Cell types assigned to cells
Var1 Freq
CD4+ Central Memory 126
CD8+ Effector Memory 71
Monocyte 143
Naïve B-cells 76
# Visualization of Clustifyr annotation of cells separately per sample
p = Seurat::DimPlot(sc, reduction="umap", group.by="type", split.by = "orig.ident", pt.size=param$pt_size) + 
  AddStyle(title="Cell types \n(annotation per cell)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()
p



Parameters and software versions

The following parameters were used to run the workflow.

out = ScrnaseqParamsInfo(params=param)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
project_id pbmc_small (from publicly available single cell RNA-seq data)
path_data name:sample1, sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample1/, /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample2/; stats:NA, NA; suffix:-1, -2
assay_raw RNA
path_out /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results/
mart_dataset hsapiens_gene_ensembl
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
mt ^MT-
cell_filter sample1:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15); sample2:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15)
feature_filter sample1:min_counts=1, min_cells=5; sample2:min_counts=1, min_cells=5
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge FALSE
integrate_samples method:merge
pc_n 10
cluster_resolution 0.8
cluster_resolution_test 0.4, 0.6, 1.2
marker_padj 0.05
marker_log2FC 1
marker_pct 0.25
enrichr_site Enrichr
enrichr_padj 0.05
enrichr_dbs GO_Biological_Process_2021, KEGG_2021_Human, WikiPathway_2021_Human
col palevioletred
col_palette_samples ggsci::pal_rickandmorty
col_palette_clusters ggsci::pal_igv
pt_size 1
path_to_git .
debugging_mode default_debugging
file_annot /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/hsapiens_gene_ensembl.v98.annot.txt
file_cc_genes /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/cell_cycle_markers.xlsx
downsample_cells_n
biomart_mirror www
samples_to_drop
col_samples sample1=#FAFD7CFF, sample2=#82491EFF
col_clusters 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF

This report was created with generated using the scrnaseq_add_condition_comparison script. Software versions were collected at run time.

out = ScrnaseqSessionInfo(param$path_to_git)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Name Version
ktrns/scrnaseq Unknown
R R version 4.1.2 (2021-11-01)
Platform x86_64-pc-linux-gnu (64-bit)
Operating system Ubuntu 20.04.3 LTS
Packages abind1.4-5, AnnotationDbi1.54.1, AnnotationHub3.0.1, assertthat0.2.1, beachmat2.8.1, Biobase2.52.0, BiocFileCache2.0.0, BiocGenerics0.38.0, BiocManager1.30.16, BiocNeighbors1.10.0, BiocParallel1.26.2, BiocSingular1.8.1, BiocVersion3.13.1, Biostrings2.60.2, bit4.0.4, bit644.0.5, bitops1.0-7, blob1.2.2, bslib0.3.0, cachem1.0.6, celldex1.2.0, cli3.0.1, cluster2.1.2, clustifyr1.4.0, clustifyrdatahub0.99.4, codetools0.2-18, colorspace2.0-2, cowplot1.1.1, crayon1.4.1, curl4.3.2, data.table1.14.2, DBI1.1.1, dbplyr2.1.1, DelayedArray0.18.0, DelayedMatrixStats1.14.3, deldir0.2-10, digest0.6.28, dplyr1.0.7, ellipsis0.3.2, entropy1.3.1, evaluate0.14, ExperimentHub2.0.0, fansi0.5.0, farver2.1.0, fastmap1.1.0, fastmatch1.1-3, fgsea1.18.0, filelock1.0.2, fitdistrplus1.1-6, future1.22.1, future.apply1.8.1, generics0.1.0, GenomeInfoDb1.28.4, GenomeInfoDbData1.2.6, GenomicRanges1.44.0, ggplot23.3.5, ggrepel0.9.1, ggridges0.5.3, globals0.14.0, glue1.4.2, goftest1.2-2, gridExtra2.3, gtable0.3.0, highr0.9, htmltools0.5.2, htmlwidgets1.5.4, httpuv1.6.3, httr1.4.2, ica1.0-2, igraph1.2.6, interactiveDisplayBase1.30.0, IRanges2.26.0, irlba2.3.3, jquerylib0.1.4, jsonlite1.7.2, kableExtra1.3.4, KEGGREST1.32.0, KernSmooth2.23-20, knitr1.36, labeling0.4.2, later1.3.0, lattice0.20-45, lazyeval0.2.2, leiden0.3.9, lifecycle1.0.1, listenv0.8.0, lmtest0.9-38, magrittr2.0.1, MASS7.3-54, Matrix1.4-0, MatrixGenerics1.4.3, matrixStats0.61.0, memoise2.0.0, mgcv1.8-38, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-152, parallelly1.28.1, patchwork1.1.1, pbapply1.5-0, pheatmap1.0.12, pillar1.6.3, pkgconfig2.0.3, plotly4.9.4.1, plyr1.8.6, png0.1-7, polyclip1.10-0, promises1.2.0.1, purrr0.3.4, R62.5.1, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-2, Rcpp1.0.7, RcppAnnoy0.0.19, RCurl1.98-1.5, reshape21.4.4, reticulate1.22, rlang0.4.11, rmarkdown2.11, ROCR1.0-11, rpart4.1-15, RSQLite2.2.8, rstudioapi0.13, rsvd1.0.5, Rtsne0.15, rvest1.0.1, S4Vectors0.30.2, sass0.4.0, ScaledMatrix1.0.0, scales1.1.1, scattermore0.7, sctransform0.3.2, sessioninfo1.1.1, Seurat4.0.4, SeuratObject4.0.2, shiny1.7.0, SingleCellExperiment1.14.1, SingleR1.6.1, sparseMatrixStats1.4.2, spatstat.core2.3-0, spatstat.data2.1-0, spatstat.geom2.2-2, spatstat.sparse2.0-0, spatstat.utils2.2-0, stringi1.7.4, stringr1.4.0, SummarizedExperiment1.22.0, survival3.2-13, svglite2.0.0, systemfonts1.0.2, tensor1.5, tibble3.1.4, tidyr1.1.4, tidyselect1.1.1, utf81.2.2, uwot0.1.10, vctrs0.3.8, viridis0.6.1, viridisLite0.4.0, webshot0.5.2, withr2.4.2, xfun0.26, xml21.3.2, xtable1.8-4, XVector0.32.0, yaml2.2.1, zlibbioc1.38.0, zoo1.8-9