# R Options
options(stringsAsFactors=FALSE,
dplyr.summarise.inform=FALSE,
knitr.table.format="html",
kableExtra_view_html=TRUE,
future.globals.maxSize=2000000000, mc.cores=1,
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
# Required libraries
library(Seurat)
library(celldex)
library(clustifyrdatahub)
library(SingleR)
library(clustifyr)
library(pheatmap)
library(patchwork)
library(magrittr)
library(ggplot2)
# Knitr default options
::opts_chunk$set(echo=TRUE, # output code
knitrcache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source='fold-hide', # by default collapse code blocks
dev=c('png', 'pdf'), # create figures in png and pdf; the first device (png) will be used for HTML output
dev.args=list(png=list(type="cairo"), # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
pdf=list(bg="white")), # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
dpi=96, # figure resolution
fig.retina=2 # retina multiplier
)
# Load Seurat S4 object; output of scrnaseq script https://github.com/ktrns/scrnaseq
load("/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/scrnaseq.RData")
# Load reference datasets
# Reference dataset obtained from celldex for singleR
# https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html
# Paste reference dataset name
= HumanPrimaryCellAtlasData()
ref = "HumanPrimaryCellAtlasData"
ref_name
# Reference from clustifyrdatahub
# List https://rnabioco.github.io/clustifyrdata/articles/download_refs.html
# Paste reference dataset name
= clustifyrdatahub::ref_hema_microarray()
ref2 = "ref_hema_microarray" ref2_name
Single cell transcriptomes can be difficult to annotate without extensive knowledge of the underlying biology. Given a reference dataset (of samples from single-cell or bulk RNA sequencing) with known labels, it is possible to assign labels to the cells from a test dataset based on similarity to that reference. Hence, the biological knowledge (defined marker genes and cluster identities) can be propagated from the reference dataset to the test dataset in an automated manner and aid in cluster identification.
Here, we performed cell type annotation for the samples sample1, sample2 of the project pbmc_small (from publicly available single cell RNA-seq data) with reference datasets obtained from databases. We use the S4 class object generated in the Main-Report “Single-cell RNA-seq data analysis” and convert it to a SingleCellExperiment class object. We use two different Tools (SingleR and Clustifyr) and reference datasets for the cluster annotation.
# Convert to SingleCellExperiment object
= as.SingleCellExperiment(sc)
sce
# Add gene_short_name to cds object
rowData(sce)$gene_short_name = rownames(sce)
sc
## An object of class Seurat
## 2092 features across 416 samples within 1 assay
## Active assay: RNA (2092 features, 2092 variable features)
## 2 dimensional reductions calculated: pca, umap
sce
## class: SingleCellExperiment
## dim: 2092 416
## metadata(0):
## assays(3): counts logcounts scaledata
## rownames(2092): FAM41C B3GALT6 ... MT-CO2 MT-ND4L
## rowData names(1): gene_short_name
## colnames(416): GCTTCACCAACACGAG-1 TAGTGCACAGAGGAAA-1 ...
## TTCCTTCAGGTAGTCG-2 TCGATTTGTACGCTTA-2
## colData names(16): orig.ident nCount_RNA ... RNA_snn_res.0.8 ident
## reducedDimNames(2): PCA UMAP
## mainExpName: RNA
## altExpNames(0):
Cells or clusters were annotated with reference dataset HumanPrimaryCellAtlasData obtained from celldex (https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html). The celldex package provides access to several reference datasets (mostly derived from bulk RNA-seq or microarray data).
# Annotate cells and clusters using SingleR with reference dataset form celldex
= SingleR(test = sce, ref = ref, assay.type.test = 1, labels = ref$label.fine)
sce_ann_cells = SingleR(test = sce, ref = ref, assay.type.test = 1, labels = ref$label.fine, clusters = sce$seurat_clusters)
sce_ann_clusters
= table(sce_ann_cells$labels)
annotated_cells = table(sce_ann_clusters$labels)
annotated_clusters
"SingleR.labels"]] = sce_ann_cells$labels
sc[["SingleR.cluster.labels"]] = sce_ann_clusters$labels[match(sc[[]][["seurat_clusters"]], rownames(sce_ann_clusters))] sc[[
UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cell. The annotation of each cell is more sensitive, but also more prone to artefacts compared to the annotation of clusters as performed in later steps. Here, we perform annotation of single cells for annotation diagnostics, that means for assessment of the reliability of cell type annotation and how close all cells resemble the cell types of the reference dataset.
# Visualization of singleR annotation of cells
= Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) +
p1 scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2") +
theme(title = element_text(size = 10)) +
NoGrid() +
NoLegend()
= LabelClusters(p1, id="seurat_clusters")
p1
= Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.labels", pt.size=param$pt_size) +
p2 AddStyle(title="Cell types \n(annotation per cell)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
= p1 + p2
p p
::kable(annotated_cells, align="l", caption="Cell types assigned to cells") %>%
knitr::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
Var1 | Freq |
---|---|
B_cell | 3 |
B_cell:immature | 24 |
B_cell:Memory | 8 |
B_cell:Naive | 35 |
B_cell:Plasma_cell | 3 |
CMP | 1 |
GMP | 1 |
Macrophage:monocyte-derived:M-CSF | 1 |
Monocyte | 6 |
Monocyte:CD14+ | 14 |
Monocyte:CD16- | 99 |
Monocyte:CD16+ | 22 |
NK_cell | 15 |
NK_cell:IL2 | 4 |
Pre-B_cell_CD34- | 1 |
T_cell:CD4+_central_memory | 76 |
T_cell:CD4+_effector_memory | 24 |
T_cell:CD4+_Naive | 54 |
T_cell:CD8+ | 23 |
T_cell:CD8+_Central_memory | 1 |
T_cell:gamma-delta | 1 |
Heatmap displays the scores for all cells across all reference labels, allowing to assess the confidence of the corresponding predicted labels. Ideally, each cell should have one score that is distinctively higher than the rest, indicating that an unambiguous assignment.
# Annotation diagnostics
# https://bioconductor.org/packages/devel/bioc/vignettes/SingleR/inst/doc/SingleR.html
= plotScoreHeatmap(sce_ann_cells)
p p
Plot displaying per-cell “deltas” (the difference between the score for the assigned label and the median across all labels). Low deltas indicate that the assignment is uncertain. The minimum threshold on the deltas is defined using an outlier-based approach. Yellow marked points represents outliers that fell below the threshold.
# Annotation diagnostics
= plotDeltaDistribution(sce_ann_cells, ncol = 3)
p p
= table(is.na(sce_ann_cells$pruned.labels))
number_pruned_table 3]=round((number_pruned_table[1]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
number_pruned_table[4]=round((number_pruned_table[2]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
number_pruned_table[names(number_pruned_table) = c("assigned", "ambiguous", "% assigned", "% ambiguous")
= (t(number_pruned_table))
number_pruned_table ::kable(number_pruned_table, align="l", caption="Number of annotated cells") %>%
knitr::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
assigned | ambiguous | % assigned | % ambiguous |
---|---|---|---|
412 | 4 | 99.04 | 0.96 |
UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cluster as unit.
# Visualization of singleR annotation of clusters
= Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) +
p1 scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2") +
theme(title = element_text(size = 10)) +
NoGrid() +
NoLegend()
= LabelClusters(p1, id="seurat_clusters")
p1
= Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.cluster.labels", pt.size=param$pt_size) +
p2 AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
= p1 + p2
p p
::kable(annotated_clusters, align="l", caption="Cell types assigned to clusters") %>%
knitr::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
Var1 | Freq |
---|---|
B_cell:immature | 1 |
Monocyte:CD16- | 1 |
T_cell:CD4+_central_memory | 1 |
T_cell:CD8+ | 1 |
# Visualization of singleR annotation of clusters separately per sample
= Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.cluster.labels", split.by = "orig.ident", pt.size=param$pt_size) +
p AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
p
Cells or clusters were annotated with reference dataset ref_hema_microarray obtained from clustifyrdatahub (https://rnabioco.github.io/clustifyrdata/articles/download_refs.html). Clustifyrdata provides 42 external data sets for cell-type assignment with Clustifyr. ## Annotation of clusters UMAPs display cells colored by Seurat clusters and cell types annotated by Clustifyr. Annotation was performed for each cluster as unit.
# Annotate clusters using Clustifyr with reference dataset form clustifyrdatahub
= clustify(sc, ref_mat = ref2, cluster_col = "seurat_clusters", query_genes = VariableFeatures(sc))
sc
# Visualization of Clustifyr annotation of clusters
= Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) +
p1 scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2") +
theme(title = element_text(size = 10)) +
NoGrid() +
NoLegend()
= LabelClusters(p1, id="seurat_clusters")
p1
= Seurat::DimPlot(sc, reduction="umap", group.by="type", pt.size=param$pt_size) +
p2 AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
= p1 + p2
p p
= table(sc$type)
annotated_clusters ::kable(annotated_clusters, align="l", caption="Cell types assigned to cells") %>%
knitr::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
Var1 | Freq |
---|---|
CD4+ Central Memory | 126 |
CD8+ Effector Memory | 71 |
Monocyte | 143 |
Naïve B-cells | 76 |
# Visualization of Clustifyr annotation of cells separately per sample
= Seurat::DimPlot(sc, reduction="umap", group.by="type", split.by = "orig.ident", pt.size=param$pt_size) +
p AddStyle(title="Cell types \n(annotation per cell)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
p
The following parameters were used to run the workflow.
= ScrnaseqParamsInfo(params=param)
out
::kable(out, align="l") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left") kableExtra
Name | Value |
---|---|
project_id | pbmc_small (from publicly available single cell RNA-seq data) |
path_data | name:sample1, sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample1/, /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample2/; stats:NA, NA; suffix:-1, -2 |
assay_raw | RNA |
path_out | /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results/ |
mart_dataset | hsapiens_gene_ensembl |
annot_version | 98 |
annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
mt | ^MT- |
cell_filter | sample1:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15); sample2:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15) |
feature_filter | sample1:min_counts=1, min_cells=5; sample2:min_counts=1, min_cells=5 |
samples_min_cells | 10 |
norm | RNA |
cc_remove | FALSE |
cc_remove_all | FALSE |
cc_rescore_after_merge | FALSE |
integrate_samples | method:merge |
pc_n | 10 |
cluster_resolution | 0.8 |
cluster_resolution_test | 0.4, 0.6, 1.2 |
marker_padj | 0.05 |
marker_log2FC | 1 |
marker_pct | 0.25 |
enrichr_site | Enrichr |
enrichr_padj | 0.05 |
enrichr_dbs | GO_Biological_Process_2021, KEGG_2021_Human, WikiPathway_2021_Human |
col | palevioletred |
col_palette_samples | ggsci::pal_rickandmorty |
col_palette_clusters | ggsci::pal_igv |
pt_size | 1 |
path_to_git | . |
debugging_mode | default_debugging |
file_annot | /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/hsapiens_gene_ensembl.v98.annot.txt |
file_cc_genes | /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/cell_cycle_markers.xlsx |
downsample_cells_n | |
biomart_mirror | www |
samples_to_drop | |
col_samples | sample1=#FAFD7CFF, sample2=#82491EFF |
col_clusters | 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF |
This report was created with generated using the scrnaseq_add_condition_comparison script. Software versions were collected at run time.
= ScrnaseqSessionInfo(param$path_to_git)
out
::kable(out, align="l") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover")) kableExtra
Name | Version |
---|---|
ktrns/scrnaseq | Unknown |
R | R version 4.1.2 (2021-11-01) |
Platform | x86_64-pc-linux-gnu (64-bit) |
Operating system | Ubuntu 20.04.3 LTS |
Packages | abind1.4-5, AnnotationDbi1.54.1, AnnotationHub3.0.1, assertthat0.2.1, beachmat2.8.1, Biobase2.52.0, BiocFileCache2.0.0, BiocGenerics0.38.0, BiocManager1.30.16, BiocNeighbors1.10.0, BiocParallel1.26.2, BiocSingular1.8.1, BiocVersion3.13.1, Biostrings2.60.2, bit4.0.4, bit644.0.5, bitops1.0-7, blob1.2.2, bslib0.3.0, cachem1.0.6, celldex1.2.0, cli3.0.1, cluster2.1.2, clustifyr1.4.0, clustifyrdatahub0.99.4, codetools0.2-18, colorspace2.0-2, cowplot1.1.1, crayon1.4.1, curl4.3.2, data.table1.14.2, DBI1.1.1, dbplyr2.1.1, DelayedArray0.18.0, DelayedMatrixStats1.14.3, deldir0.2-10, digest0.6.28, dplyr1.0.7, ellipsis0.3.2, entropy1.3.1, evaluate0.14, ExperimentHub2.0.0, fansi0.5.0, farver2.1.0, fastmap1.1.0, fastmatch1.1-3, fgsea1.18.0, filelock1.0.2, fitdistrplus1.1-6, future1.22.1, future.apply1.8.1, generics0.1.0, GenomeInfoDb1.28.4, GenomeInfoDbData1.2.6, GenomicRanges1.44.0, ggplot23.3.5, ggrepel0.9.1, ggridges0.5.3, globals0.14.0, glue1.4.2, goftest1.2-2, gridExtra2.3, gtable0.3.0, highr0.9, htmltools0.5.2, htmlwidgets1.5.4, httpuv1.6.3, httr1.4.2, ica1.0-2, igraph1.2.6, interactiveDisplayBase1.30.0, IRanges2.26.0, irlba2.3.3, jquerylib0.1.4, jsonlite1.7.2, kableExtra1.3.4, KEGGREST1.32.0, KernSmooth2.23-20, knitr1.36, labeling0.4.2, later1.3.0, lattice0.20-45, lazyeval0.2.2, leiden0.3.9, lifecycle1.0.1, listenv0.8.0, lmtest0.9-38, magrittr2.0.1, MASS7.3-54, Matrix1.4-0, MatrixGenerics1.4.3, matrixStats0.61.0, memoise2.0.0, mgcv1.8-38, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-152, parallelly1.28.1, patchwork1.1.1, pbapply1.5-0, pheatmap1.0.12, pillar1.6.3, pkgconfig2.0.3, plotly4.9.4.1, plyr1.8.6, png0.1-7, polyclip1.10-0, promises1.2.0.1, purrr0.3.4, R62.5.1, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-2, Rcpp1.0.7, RcppAnnoy0.0.19, RCurl1.98-1.5, reshape21.4.4, reticulate1.22, rlang0.4.11, rmarkdown2.11, ROCR1.0-11, rpart4.1-15, RSQLite2.2.8, rstudioapi0.13, rsvd1.0.5, Rtsne0.15, rvest1.0.1, S4Vectors0.30.2, sass0.4.0, ScaledMatrix1.0.0, scales1.1.1, scattermore0.7, sctransform0.3.2, sessioninfo1.1.1, Seurat4.0.4, SeuratObject4.0.2, shiny1.7.0, SingleCellExperiment1.14.1, SingleR1.6.1, sparseMatrixStats1.4.2, spatstat.core2.3-0, spatstat.data2.1-0, spatstat.geom2.2-2, spatstat.sparse2.0-0, spatstat.utils2.2-0, stringi1.7.4, stringr1.4.0, SummarizedExperiment1.22.0, survival3.2-13, svglite2.0.0, systemfonts1.0.2, tensor1.5, tibble3.1.4, tidyr1.1.4, tidyselect1.1.1, utf81.2.2, uwot0.1.10, vctrs0.3.8, viridis0.6.1, viridisLite0.4.0, webshot0.5.2, withr2.4.2, xfun0.26, xml21.3.2, xtable1.8-4, XVector0.32.0, yaml2.2.1, zlibbioc1.38.0, zoo1.8-9 |