# R Options
options(stringsAsFactors=FALSE,
        dplyr.summarise.inform=FALSE, 
        knitr.table.format="html",
        kableExtra_view_html=TRUE,
        future.globals.maxSize=2000000000, mc.cores=1, 
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

# Required libraries
library(monocle3)
library(Seurat)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(magrittr)
library(biomaRt)
library(dplyr)
library(tidyr)

# Load Seurat S4 object; output of scrnaseq script https://github.com/ktrns/scrnaseq
load("/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/scrnaseq.RData")

# Knitr default options
knitr::opts_chunk$set(echo=TRUE,                     # output code
                      cache=FALSE,                   # do not cache results
                      message=TRUE,                  # show messages
                      warning=TRUE,                  # show warnings
                      tidy=FALSE,                    # do not auto-tidy-up code
                      fig.width=10,                  # default fig width in inches
                      class.source='fold-hide',      # by default collapse code blocks
                      dev=c('png', 'pdf'),           # create figures in png and pdf; the first device (png) will be used for HTML output
                      dev.args=list(png=list(type="cairo"),  # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
                                    pdf=list(bg="white")),     # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
                      dpi=96,                        # figure resolution
                      fig.retina=2                   # retina multiplier
)
# displaying and testing different roots is possible, but only the last one is used in subsequent plots; At least one root needs to be provided!
param$trajectory_root = c("CORO1A")
# Choose clusters that should be displayed in trajectory (all clusters are used in calculation); NULL if all clusters should be displayed
#param$trajectory_cluster = NULL
param$trajectory_cluster = c("2", "4")
if (is.null(param$trajectory_cluster)) {
  param$trajectory_cluster = levels(sc$seurat_clusters)
}  
# individual genes that should be plotted in feature plots; NULL if no target genes are specified
#param$plot_genes = NULL
param$plot_genes = c("CD3E", "CD4", "EOMES", "CD22", "CD1D")
# Feature plots and Violinplots
# The height of 1 row (= 1 plot) is fixed to 5 
fig_height_plots = max(3.5, 3.5 + 3 * length(param$trajectory_root))

# Dimplots (for multiple conditions)
height_per_plot_genes = max(2.5, 2.5 * round(length(param$plot_genes)/3))

# Dotplots
# We fix the height of each row in a plot to the same height
height_per_row = max(0.2, 0.2 * 2 * length(param$trajectory_cluster))
fig_height_dotplots = max(3, 3 + height_per_row)

Dataset description

Constructing single-cell trajectories for the samples sample1, sample2 of the project pbmc_small (from publicly available single cell RNA-seq data).

During many biological processes, cells transition from one functional “state” to another or differentiate towards a required functional end state. Such events are characterized by the modulation of specific gene expression programs over time. By use of single cell expression signatures, it is, thus, possible to track down a cell’s ‘position’ within such a predetermined developmental path. The R packaged Monocle3 offers a strategy to explore respective expression changes when cells pass through such a ‘pseudotime’ matrix and to construct and visualize accordingly derived single-cell trajectories.

Here we use the S4 class object generated in the Main-Report “Single-cell RNA-seq data analysis” and convert it to a SingleCellExperiment class object for analysis of pseudotime trajectory of cells.

# Convert S4 class object to SingleCellExperiment class
cds = as.cell_data_set(sc)
## Warning: Monocle 3 trajectories require cluster partitions, which Seurat does
## not calculate. Please run 'cluster_cells' on your cell_data_set object
# Add gene_short_name to cds object
rowData(cds)$gene_short_name = rownames(cds)

# Transfer clusters from sc to cds object
cds = cluster_cells(cds)
cds@clusters$UMAP$clusters = sc@meta.data$seurat_clusters
names(cds@clusters$UMAP$clusters) = rownames(sc@meta.data)

# Print both objects' information
sc
## An object of class Seurat 
## 2092 features across 416 samples within 1 assay 
## Active assay: RNA (2092 features, 2092 variable features)
##  2 dimensional reductions calculated: pca, umap
cds
## class: cell_data_set 
## dim: 2092 416 
## metadata(1): citations
## assays(3): counts logcounts scaledata
## rownames(2092): FAM41C B3GALT6 ... MT-CO2 MT-ND4L
## rowData names(1): gene_short_name
## colnames(416): GCTTCACCAACACGAG-1 TAGTGCACAGAGGAAA-1 ...
##   TTCCTTCAGGTAGTCG-2 TCGATTTGTACGCTTA-2
## colData names(17): orig.ident nCount_RNA ... ident Size_Factor
## reducedDimNames(2): PCA UMAP
## mainExpName: RNA
## altExpNames(0):

Constructing single-cell trajectories

Classification of cells

While in some situations cells may continuously transition from one state to the next along one trajectory, in other cases multiple distinct trajectories might reflect the experimental situation more precisely. For example, different cell types have different initial transcriptomes and response to a stimulus by moving along distinct trajectories. Such distinct trajectories are identified as different “partitions” through the clustering procedure.

# Plot cells coloured by sample
p1 = DimPlot(sc, group.by = "orig.ident", cols = param$col_samples, pt.size = param$pt_size) +
  AddStyle(title="Coloured by sample", legend_title = "", xlab = "UMAP 1", ylab = "UMAP 2") +
  theme(text = element_text(size = 12), 
  axis.line = element_line(size = 0.5, color = "grey50"), 
  legend.position = "bottom") +
  NoGrid()

# Plot cells coloured by cluster
p2 = plot_cells(cds, show_trajectory_graph = FALSE, label_cell_groups = FALSE, cell_size = param$pt_size) +
  AddStyle(title="Coloured by cluster", xlab = "UMAP 1", ylab = "UMAP 2") +
  scale_color_manual(values = param$col_clusters) +
  theme(text = element_text(size = 12), 
  axis.line = element_line(size = 0.5, color = "grey50"), 
  legend.position = "bottom") +
  NoGrid()

# Plot cells coloured by partition
p3 = plot_cells(cds, color_cells_by = "partition", show_trajectory_graph = FALSE, label_cell_groups = FALSE, cell_size = param$pt_size) +
  AddStyle(title="Coloured by partition", xlab = "UMAP 1", ylab = "UMAP 2") +
  theme(text = element_text(size = 12), legend.title = element_blank(), 
  axis.line = element_line(size = 0.5, color = "grey50"), 
  legend.position = "bottom") +
  NoGrid()

p = patchwork::wrap_plots(p1, p2, p3, ncol = 3)
p

Plot pseudotime trajectory

Next, we fit a principal graph of gene expression changes within each partition and place each cell at its proper position according to its progress through the respective trajectory. To place the cells in order, a “root” of the trajectory has to be defined as the beginning of the biological process. That means, here we have set one gene (CORO1A) as root, which is (exclusively) expressed at the early state of the biological process. The progress along the trajectory an individual cell has made is measured in pseudotime. Pseudotime is an abstract unit that indicates the distance between a cell and the start of the trajectory. Here we are displaying three different visualisations for the pseudotime and cell trajectory. Labeled black circles indicate branches of the trajectory tree to different outcomes (i.e. cell fate), while the white circled number represents the root. Gray colored cells have infinite pseudotime because they were not reachable from the chosen root.

# Learn the trajectory graph
cds = learn_graph(cds, close_loop = TRUE)
# Plot trajectory with root and colored by clusters
p1 = plot_cells(cds, label_groups_by_cluster = FALSE, label_leaves = FALSE, label_branch_points = FALSE, graph_label_size=3, group_label_size = 0, cell_size = param$pt_size) +
  AddStyle(legend_title = "cluster", xlab = "UMAP 1", ylab = "UMAP 2") +
  scale_color_manual(values = param$col_clusters) +
  theme(text = element_text(size = 12), 
  axis.line = element_line(size = 0.5, color = "grey50"), 
  legend.position = "bottom") +
  NoGrid()

p_list = NULL

for (i in param$trajectory_root) {

# Integrate root to trajectory
max.root = which.max(unlist(FetchData(sc, i)))
max.root = colnames(sc)[max.root]
cds = order_cells(cds, root_cells = max.root)

# Extract the pseudotime and partition values form the SingleCellExperiment object and add them to the Seurat object
sc = AddMetaData(sc, metadata = cds@principal_graph_aux@listData$UMAP$pseudotime, col.name = "pseudotime")
sc = AddMetaData(sc, metadata = cds@clusters@listData$UMAP$partitions, col.name = "partition")



# Plot trajectory with root and colored by pseudotime
p2 = plot_cells(cds, color_cells_by = "pseudotime", label_cell_groups = FALSE, label_leaves = FALSE, label_branch_points = TRUE, graph_label_size=3, cell_size = param$pt_size) +
  theme(text = element_text(size = 12), 
  axis.line = element_line(size = 0.5, color = "grey50")) +
  NoGrid() +
  RestoreLegend(position = "right")

# Plot trajectory with root and colored by pseudotime as FeaturePlot with Seurat object
p3 = FeaturePlot(sc, features = "pseudotime", pt.size = param$pt_size) + 
  AddStyle(title=i, legend_title = "pseudotime", xlab = "UMAP 1", ylab = "UMAP 2") +
  scale_color_viridis_c() +
  theme(text = element_text(size = 12), 
  axis.line = element_line(size = 0.5, color = "grey50")) +
  NoGrid() +
  RestoreLegend(position = "right")

p_list[[i]] = p2 + p3
}

p1 = patchwork::wrap_plots(p1 + plot_spacer())
p = p1 + p_list + plot_layout(ncol=1) + 
  AddStyle(title="Pseudotime trajectory") 
p

Distribution of cell along trajectory

The clusters 2, 4, as part of one trajectory, were manually chosen and shown in the following. We display the relative counts of cells that were detected in the same state (same pseudotime point) along the cell trajectory.

# Test whether cells at similar positions on the trajectory have correlated expression
pseudotime_values = FetchData(object = sc, vars = c("pseudotime", "seurat_clusters", "orig.ident"))
pseudotime_values = filter(pseudotime_values, seurat_clusters %in% param$trajectory_cluster)

p1 = ggplot(pseudotime_values, aes(x = pseudotime, fill = seurat_clusters)) + 
  geom_density(alpha=0.7) +
  theme_classic() +
  AddStyle(legend_title = "cluster") +
  NoGrid() +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.line = element_line(size = 0.5)) +
  scale_fill_manual(values = param$col_clusters)

p2 = ggplot(pseudotime_values, aes(x = pseudotime, fill = seurat_clusters)) + 
  geom_density() +
  facet_grid(seurat_clusters ~ .) +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.line = element_line(size = 0.5)) +
  scale_fill_manual(values = param$col_clusters) +
  NoLegend()

p3 = ggplot(pseudotime_values, aes(x = pseudotime, fill = orig.ident)) + 
  geom_density(alpha=0.7) +
  theme_classic() +
  AddStyle(legend_title = "sample", col = param$col_samples) +
  NoGrid() +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.line = element_line(size = 0.5)) +
  scale_fill_manual(values = param$col_samples)


p = patchwork::wrap_plots((p1 / p3) | p2)
p

Gene expression along cell trajectory

To identify the genes that are regulated over the course of the trajectory, we test with the Moran’s I test whether cells at similar positions on the trajectory have also correlated expression of individual genes.

# Test whether cells at similar positions on the trajectory have correlated expression
morans_test_res = graph_test(cds, neighbor_graph="principal_graph", cores=4)
write.table(x = morans_test_res, file = paste0(param$path_out, "/morans_test_result.xlsx"))

Genes expression as function of pseudotime

The plots display the dynamics of gene expression as a function of pseudotime for the top 12 genes with the highest correlation of expression to trajectory position.

# 
attach(morans_test_res)
morans_test_res_sort = morans_test_res[order(q_value, -morans_I),]
detach(morans_test_res)
top12 = row.names(head(morans_test_res_sort, 12))
top50 = row.names(head(morans_test_res_sort, 50))
top100 = row.names(head(morans_test_res_sort, 100))


p_list = NULL
for (i in top12) {
  p = FeatureScatter(subset(sc, idents = param$trajectory_cluster), feature1 = "pseudotime", feature2 = i, cols = param$col_clusters) +
    geom_smooth(method = "loess", color="black") +
    AddStyle(title = "") +
    theme(text = element_text(size = 12), 
    axis.line = element_line(size = 0.5, color = "grey50")) +
    NoGrid() +
    NoLegend()
  p_list[[i]] = p
}
p_list = patchwork::wrap_plots(p_list, ncol = 3)
p_list

Genes expression as function of pseudotime (per sample)

# 
orig_ident_levels = levels(sc$orig.ident)
sc_subsets = list()

for (h in orig_ident_levels) {
  sc_subsets[[h]] = subset(x = sc, subset = orig.ident == h)
}

  p_list = NULL
for (i in top12) {
  p_samples = purrr::map(list_names(sc_subsets), function(n) {
  p = Seurat::FeatureScatter(object = subset(sc_subsets[[n]], idents = param$trajectory_cluster), feature1 = "pseudotime", feature2 = i, cols = param$col_clusters) +
    geom_smooth(method = "loess", color="black") +
    AddStyle(title = "") +
    theme(text = element_text(size = 12), 
    axis.line = element_line(size = 0.5, color = "grey50")) +
    NoGrid() +
    NoLegend()
  })
p = patchwork::wrap_plots(p_samples, ncol=2)

  p_list[[i]] = p
  
}
p_list = patchwork::wrap_plots(p_list, ncol = 2)
p_list

Dotplot

The dotplot displays the average gene expression per cluster for the top 50 genes with the highest correlation of expression to trajectory position.

p = DotPlot(sc, features = top50, group.by = "seurat_clusters", split.by = "orig.ident", cols = param$col_samples, idents = param$trajectory_cluster) +
  RotatedAxis() +
  theme(axis.text.x.bottom = element_text(size = 8)) +
  RestoreLegend()
p
## Warning: Removed 8 rows containing missing values (geom_point).

Heatmap

The heatmap displays the gene expression for the top 50 genes with the highest correlation of expression to trajectory position.

p = DoHeatmap(subset(sc, idents = param$trajectory_cluster), features = top100, group.colors=param$col_clusters)
p

Plot pseudotime for target genes

Visualization of gene expression of individual target genes.

if (!is.null(param$plot_genes)) {
  # Plot pseudotime for target gene
p = plot_cells(cds, genes = param$plot_genes, scale_to_range = TRUE, label_cell_groups=FALSE, label_leaves = FALSE, graph_label_size = 3, cell_size = param$pt_size) +
    theme(text = element_text(size = 12), axis.line = element_line(size = 0.5, color = "grey50"), plot.title = element_text(size = 12)) +
    NoGrid() +
    RestoreLegend(position = "right")
p
} else {message("No target gene selected")}

# Plot feature plot for target gene
p_list = NULL
for (i in param$plot_genes) {
  p = FeaturePlot(sc, features = i, pt.size = param$pt_size) +
    AddStyle(legend_title = "expression", xlab = "UMAP 1", ylab = "UMAP 2", title = i) +
    theme(text = element_text(size = 12), axis.line = element_line(size = 0.5, color = "grey50"), plot.title = element_text(size = 10, vjust = 0, hjust = 0.5)) +
    NoGrid() +
   RestoreLegend(position = "right")
  p_list[[i]] = p
}

p = patchwork::wrap_plots(p_list, ncol = 3)
p

Parameters and software versions

The following parameters were used to run the workflow.

out = ScrnaseqParamsInfo(params=param)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
project_id pbmc_small (from publicly available single cell RNA-seq data)
path_data name:sample1, sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample1/, /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample2/; stats:NA, NA; suffix:-1, -2
assay_raw RNA
path_out /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results/
mart_dataset hsapiens_gene_ensembl
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
mt ^MT-
cell_filter sample1:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15); sample2:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15)
feature_filter sample1:min_counts=1, min_cells=5; sample2:min_counts=1, min_cells=5
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge FALSE
integrate_samples method:merge
pc_n 10
cluster_resolution 0.8
cluster_resolution_test 0.4, 0.6, 1.2
marker_padj 0.05
marker_log2FC 1
marker_pct 0.25
enrichr_site Enrichr
enrichr_padj 0.05
enrichr_dbs GO_Biological_Process_2021, KEGG_2021_Human, WikiPathway_2021_Human
col palevioletred
col_palette_samples ggsci::pal_rickandmorty
col_palette_clusters ggsci::pal_igv
pt_size 1
path_to_git .
debugging_mode default_debugging
file_annot /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/hsapiens_gene_ensembl.v98.annot.txt
file_cc_genes /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/cell_cycle_markers.xlsx
downsample_cells_n
biomart_mirror www
samples_to_drop
col_samples sample1=#FAFD7CFF, sample2=#82491EFF
col_clusters 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF
trajectory_root CORO1A
trajectory_cluster 2, 4
plot_genes CD3E, CD4, EOMES, CD22, CD1D

This report was created with generated using the scrnaseq_add_condition_comparison script. Software versions were collected at run time.

out = ScrnaseqSessionInfo(param$path_to_git)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Name Version
ktrns/scrnaseq 62ed87b7523480a9cf0876855fd2ce0ca242a331
R R version 4.1.2 (2021-11-01)
Platform x86_64-pc-linux-gnu (64-bit)
Operating system Ubuntu 20.04.3 LTS
Packages abind1.4-5, AnnotationDbi1.54.1, assertthat0.2.1, Biobase2.52.0, BiocFileCache2.0.0, BiocGenerics0.38.0, BiocManager1.30.16, biomaRt2.48.3, Biostrings2.60.2, bit4.0.4, bit644.0.5, bitops1.0-7, blob1.2.2, boot1.3-28, bslib0.3.0, cachem1.0.6, class7.3-20, classInt0.4-3, cli3.0.1, cluster2.1.2, codetools0.2-18, colorspace2.0-2, cowplot1.1.1, crayon1.4.1, curl4.3.2, data.table1.14.2, DBI1.1.1, dbplyr2.1.1, DelayedArray0.18.0, deldir0.2-10, digest0.6.28, dplyr1.0.7, e10711.7-9, ellipsis0.3.2, evaluate0.14, fansi0.5.0, farver2.1.0, fastmap1.1.0, filelock1.0.2, fitdistrplus1.1-6, future1.22.1, future.apply1.8.1, generics0.1.0, GenomeInfoDb1.28.4, GenomeInfoDbData1.2.6, GenomicRanges1.44.0, ggplot23.3.5, ggrepel0.9.1, ggridges0.5.3, globals0.14.0, glue1.4.2, goftest1.2-2, gridExtra2.3, gtable0.3.0, highr0.9, hms1.1.1, htmltools0.5.2, htmlwidgets1.5.4, httpuv1.6.3, httr1.4.2, ica1.0-2, igraph1.2.6, IRanges2.26.0, irlba2.3.3, jquerylib0.1.4, jsonlite1.7.2, kableExtra1.3.4, KEGGREST1.32.0, KernSmooth2.23-20, knitr1.36, labeling0.4.2, later1.3.0, lattice0.20-45, lazyeval0.2.2, leiden0.3.9, leidenbase0.1.3, lifecycle1.0.1, listenv0.8.0, lmtest0.9-38, magrittr2.0.1, MASS7.3-55, Matrix1.4-0, MatrixGenerics1.4.3, matrixStats0.61.0, memoise2.0.0, mgcv1.8-38, mime0.12, miniUI0.1.1.1, monocle31.0.0, munsell0.5.0, nlme3.1-155, parallelly1.28.1, patchwork1.1.1, pbapply1.5-0, pbmcapply1.5.0, pillar1.6.3, pkgconfig2.0.3, plotly4.9.4.1, plyr1.8.6, png0.1-7, polyclip1.10-0, prettyunits1.1.1, progress1.2.2, promises1.2.0.1, proxy0.4-26, purrr0.3.4, R.methodsS31.8.1, R.oo1.24.0, R.utils2.11.0, R62.5.1, RANN2.6.1, rappdirs0.3.3, raster3.5-11, RColorBrewer1.1-2, Rcpp1.0.7, RcppAnnoy0.0.19, RCurl1.98-1.5, remotes2.4.1, reshape21.4.4, reticulate1.22, rlang0.4.11, rmarkdown2.11, ROCR1.0-11, rpart4.1.16, RSQLite2.2.8, rstudioapi0.13, rsvd1.0.5, Rtsne0.15, rvest1.0.1, s21.0.7, S4Vectors0.30.2, sass0.4.0, scales1.1.1, scattermore0.7, sctransform0.3.2, sessioninfo1.1.1, Seurat4.0.4, SeuratObject4.0.2, SeuratWrappers0.3.0, sf1.0-5, shiny1.7.0, SingleCellExperiment1.14.1, slam0.1-50, sp1.4-6, spatstat.core2.3-0, spatstat.data2.1-0, spatstat.geom2.2-2, spatstat.sparse2.0-0, spatstat.utils2.2-0, spData2.0.1, spdep1.2-1, stringi1.7.4, stringr1.4.0, SummarizedExperiment1.22.0, survival3.2-13, svglite2.0.0, systemfonts1.0.2, tensor1.5, terra1.5-12, tibble3.1.4, tidyr1.1.4, tidyselect1.1.1, units0.7-2, utf81.2.2, uwot0.1.10, vctrs0.3.8, viridis0.6.1, viridisLite0.4.0, webshot0.5.2, withr2.4.2, wk0.6.0, xfun0.26, XML3.99-0.8, xml21.3.2, xtable1.8-4, XVector0.32.0, yaml2.2.1, zlibbioc1.38.0, zoo1.8-9