Generating an endometrial atlas

Published

June 30, 2025

Load packages

suppressPackageStartupMessages({
    library(fishpond)
    library(SummarizedExperiment)
    library(SingleCellExperiment)
    library(scuttle)
    library(scater)
    library(scran)
    library(dplyr)
    library(tidyr)
    library(ggplot2)
    library(msigdbr)
    library(batchelor)
    library(cowplot)
    library(ComplexHeatmap)
    library(circlize)
    library(scDblFinder)
    library(Seurat)
    library(sketchR)
    library(tricycle)
    library(org.Hs.eg.db)
    library(parallel)
    library(bluster)
    library(BiocParallel)
    library(igraph)
    library(DT)
    library(SingleR)
})

Define parameters and reference files

samples <- c("4861STDY7387181", "4861STDY7387182", "4861STDY7387183", 
             "4861STDY7771115", "4861STDY7771123", "MRC_Endo8625698", 
             "MRC_Endo8625699", "MRC_Endo8712024", "MRC_Endo8712032", 
             "MRC_Endo8715415", "MRC_Endo8715416", "GSM4577306", "GSM4577307", 
             "GSM4577308", "GSM4577309", "GSM4577310", "GSM4577311", 
             "GSM4577312", "GSM4577313", "GSM4577314", "GSM4577315", 
             "Shih_MEtissue1", "Shih_MEtissue2", "Shih_MEtissue3", 
             "Shih_MEtissue4", "Shih_MEtissue5", "Shih_MEtissue6", 
             "Shih_MEtissue7", "Huang_GSM6605437", "Huang_GSM6605438", 
             "Huang_GSM6605440", "Huang_GSM6605439", "Huang_GSM7277296", 
             "Huang_GSM7277297", "Huang_GSM7277298", "Lai_GSM5572238", 
             "Lai_GSM5572239", "Lai_GSM5572240")
blockvar <- "sample"           ## used as blocking factor in doublet annotation
batchvar <- "donorCol"         ## used for batch correction
datasetvar <- "matchDataset"   ## used as blocking factor in HVG annotation

## Sample annotation
sampleannot <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/invivo_atlas_samples.txt"

## Gene annotation (derived from GENCODE v38 gtf file)
t2g <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/reference/gencode.v38.annotation.tx2gene_full.rds"

## Reproductive cell atlas data (downloaded from 
## https://www.reproductivecellatlas.org/ and converted to SingleCellExperiment 
## format)
rca <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/data/reproductivecellatlas/2021/endometrium_all.rds"
heca <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/data/reproductivecellatlas/2023/endometriumAtlasV2_cells_with_counts.rds"
hecaimmune <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/data/reproductivecellatlas/2023/endometriumAtlasV2_cells_immune.rds"

## Cell cycle marker genes
## https://github.com/ventolab/HECA-Human-Endometrial-Cell-Atlas/tree/main/utils
s_genes <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/data/reproductivecellatlas/2023/S_genes_HECA.tsv"
g2m_genes <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/data/reproductivecellatlas/2023/G2M_genes_HECA.tsv"

## Shih et al cell annotations (obtained from 
## https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE203191)
shih_cellannot <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/data/GSE203191/GSE203191_Shih_endo_meta_frame.tsv"

## Epithelial atlas
epiatlas <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/invivo_epithelial_cell_atlas.rds"

Assemble gene annotation

geneannot <- readRDS(t2g) |>
    dplyr::select(gene_id, seqnames, gene_type, gene_name) |>
    dplyr::rename(chromosome = seqnames) |>
    dplyr::distinct()

## Annotate ribosomal protein genes
## GO term "structural constituent of ribosome" (GO:0003735)
golist <- as.list(org.Hs.egGO2ALLEGS)
ribo_entrez <- unique(golist[["GO:0003735"]])
length(ribo_entrez)
[1] 175
tmp <- select(org.Hs.eg.db, keys = ribo_entrez, columns = "ENSEMBL", 
              keytype = "ENTREZID")
'select()' returned 1:many mapping between keys and columns
ribo_ensembl <- unique(na.omit(tmp[, "ENSEMBL"]))
length(ribo_ensembl)
[1] 195
summary(ribo_ensembl %in% sub("\\.[0-9]+$", "", geneannot$gene_id))
   Mode   FALSE    TRUE 
logical      33     162 
geneannot$is_ribosomal_protein <- 
    sub("\\.[0-9]+$", "", geneannot$gene_id) %in% ribo_ensembl
geneannot$gene_type_ribo <- ifelse(
    geneannot$gene_type == "protein_coding",
    ifelse(geneannot$is_ribosomal_protein, "protein_coding_ribosome",
           geneannot$gene_type),
    geneannot$gene_type)

Read data

## Get sample IDs
tbl <- read.delim(sampleannot) |>
     dplyr::filter(SampleName %in% samples)
DT::datatable(tbl, options = list(scrollX = TRUE, pageLength = 50))
sns <- unique(tbl$SampleName)
names(sns) <- sns

## Read alevin-fry data and combine
dat <- do.call(cbind, lapply(sns, function(s) {
    x <- fishpond::loadFry(
        fryDir = file.path("../scRNAseq_invivo_endometrium_epithelial_cell_atlas", 
                           "data", "alevin_fry_splici", s), 
        outputFormat = list(counts = c("S", "A"), spliced = c("S", "A"),
                            unspliced = c("U"), total = c("U", "S", "A")))
    rowData(x)$gene_name <- 
        geneannot$gene_name[match(rowData(x)$gene_ids, geneannot$gene_id)]
    rowData(x)$chromosome <- 
        geneannot$chromosome[match(rowData(x)$gene_ids, geneannot$gene_id)]
    rowData(x)$gene_type <- 
        geneannot$gene_type[match(rowData(x)$gene_ids, geneannot$gene_id)]
    rowData(x)$gene_type_ribo <- 
        geneannot$gene_type_ribo[match(rowData(x)$gene_ids, geneannot$gene_id)]
    
    ## Keep only cells with at least 1,000 detected genes
    x <- x[, colSums(assay(x, "total") > 0) > 1000]
    
    ## Add sample ID to colnames
    x$sample <- s
    colnames(x) <- paste0(s, "_", colnames(x))
    x
}))
dat
class: SingleCellExperiment 
dim: 60649 274348 
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): ENSG00000223972.5 ENSG00000243485.5 ...
  ENSG00000210194.1 ENSG00000210196.2
rowData names(5): gene_ids gene_name chromosome gene_type
  gene_type_ribo
colnames(274348): 4861STDY7387181_AGAGTGGGTCTAGCGC
  4861STDY7387181_TAAGAGATCTGAGTGT ...
  Huang_GSM7277298_CTATCCGTCTTTCGAT Huang_GSM7277298_GGGTTTACAGAACTTC
colData names(2): barcodes sample
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
## Set rownames to gene names where possible
rownames(dat) <- scater::uniquifyFeatureNames(
    ID = rownames(dat), 
    names = rowData(dat)$gene_name
)

Add sample annotations

## Extract barcodes corresponding to control samples in Shih data and remove 
## all other barcodes from the Shih samples
cb <- read.delim(shih_cellannot) |>
    dplyr::filter(pheno == "Control") |>
    dplyr::mutate(barcode = gsub("-[0-9]$", "", barcode)) |>
    dplyr::mutate(sample_cb = paste("Shih", run, barcode, sep = "_")) |>
    dplyr::pull(sample_cb)
to_remove <- which(grepl("Shih", dat$sample) & 
                       !paste0(dat$sample, "_", dat$barcodes) %in% cb)
dat <- dat[, -to_remove]
dim(dat)
[1]  60649 253579
## Add 'cell' column
dat$cell <- colnames(dat)

## Any other provided annotations
tbl2 <- tbl |>
    dplyr::filter(RunID != "Shih")
if (nrow(tbl2) > 0 && (length(tbl2$SampleName) == length(unique(tbl2$SampleName)))) {
    for (cn in colnames(tbl2)) {
        if (!all(is.na(tbl2[[cn]])) && !(cn %in% colnames(colData(dat)))) {
            if (cn == "Day") {
                colData(dat)[[cn]] <- paste0(
                    "D", tbl2[[cn]][match(dat$sample, tbl2$SampleName)])
            } else if (cn == "RunID") {
                colData(dat)[[cn]] <- sub("_[0-9]+$", "", 
                                          tbl2[[cn]][match(dat$sample, 
                                                           tbl2$SampleName)])
            } else {
                colData(dat)[[cn]] <- tbl2[[cn]][match(dat$sample, tbl2$SampleName)]
            }
        }
    }
}

## Harmonize annotation values
dat$Stage <- gsub(" stage", "", dat$Stage)

## Add information for the Shih data set
shih_annot <- read.delim(shih_cellannot) |>
    dplyr::mutate(cell = paste0("Shih_", run, "_", gsub("-[0-9]$", "", barcode)))
if (any(dat$cell %in% shih_annot$cell)) {
    for (cn in c("clusterID", "pheno", "Phase", "subjectID", "NKsubclusterID",
                 "StromalSubclusterID")) {
        colData(dat)[[paste0("Shih_", cn)]] <- 
            shih_annot[[cn]][match(dat$cell, shih_annot$cell)]
    }
}
dat$RunID[grep("Shih", dat$sample)] <- "Shih"
dat$Stage[grep("Shih", dat$sample)] <- "menstrual"

dat$StageFactor <- factor(dat$Stage, 
                          levels = c("menstrual", "proliferative",
                                     "early-secretory", "early-mid-secretory",
                                     "mid-secretory", "late-secretory"))
dat$StageCoarse <- sub("early-|early-mid-|mid-|late-", "", dat$Stage)

## Create a 'donorCol' column
dat$donorCol <- NA_character_
if ("DonorID" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["DonorID"]]))
    dat$donorCol[idx] <- dat[["DonorID"]][idx]
}
if ("Shih_subjectID" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["Shih_subjectID"]]))
    dat$donorCol[idx] <- dat[["Shih_subjectID"]][idx]
}

## Create a 'sampleCol' column (either sample or Shih_subjectID)
dat$sampleCol <- NA_character_
if ("sample" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["sample"]]))
    dat$sampleCol[idx] <- dat[["sample"]][idx]
}
if ("Shih_subjectID" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["Shih_subjectID"]]))
    dat$sampleCol[idx] <- dat[["Shih_subjectID"]][idx]
}

## Generate columns to match to HECA
dat$matchDataset <- ifelse(dat$RunID %in% c("Wang", "Shih", "Lai", "Huang"), 
                           dat$RunID, "GarciaAlonso")
dat$matchCell <- sub("^Huang_|^Lai_|^Shih_", "", dat$cell)
dat$matchId <- paste0(dat$matchCell, "-", dat$matchDataset)

## Add annotations from the reproductive cell atlas (2021)
rca <- readRDS(rca)
table(dat$matchDataset, colnames(dat) %in% colnames(rca), useNA = "ifany")
              
               FALSE  TRUE
  GarciaAlonso 28396 40337
  Huang        63190     0
  Lai          18499     0
  Shih          3110     0
  Wang         47223 52824
dat$RCA_CellCyclePhase <- rca$`CellCycle Phase`[match(colnames(dat), colnames(rca))]
dat$RCA_Celltype <- rca$`Cell type`[match(colnames(dat), colnames(rca))]
dat$RCA_percentmito <- rca$percent_mito[match(colnames(dat), colnames(rca))]
dat$RCA_ngenes <- rca$n_genes[match(colnames(dat), colnames(rca))]
dat$RCA_BiopsyType <- rca$BiopsyType[match(colnames(dat), colnames(rca))]
dat$RCA_Location <- rca$Location[match(colnames(dat), colnames(rca))]
dat$RCA_BroadCelltype <- rca$`Broad cell type`[match(colnames(dat), colnames(rca))]

## Add annotations from HECA (2023)
heca <- readRDS(heca)
hecaimmune <- readRDS(hecaimmune)
table(dat$matchDataset, dat$matchId %in% colnames(heca), useNA = "ifany")
              
               FALSE  TRUE
  GarciaAlonso 37041 31692
  Huang        34420 28770
  Lai           6227 12272
  Shih          3110     0
  Wang         46329 53718
dat$HECA_CellCyclePhase <- as.character(heca$phase[match(dat$matchId, colnames(heca))])
dat$HECA_Celltype <- as.character(heca$celltype[match(dat$matchId, colnames(heca))])
dat$HECA_CelltypeImmune <- as.character(hecaimmune$celltype[match(dat$matchId, colnames(hecaimmune))])
dat$HECA_ngenes <- heca$n_genes[match(dat$matchId, colnames(heca))]
dat$HECA_percentmito <- heca$percent_mito[match(dat$matchId, colnames(heca))]
dat$HECA_ScrubletScore <- heca$scrublet_score[match(dat$matchId, colnames(heca))]
dat$HECA_Stage <- as.character(heca$Stage[match(dat$matchId, colnames(heca))])
dat$HECA_Lineage <- as.character(heca$lineage[match(dat$matchId, colnames(heca))])
dat$HECA_Label <- as.character(heca$label_long[match(dat$matchId, colnames(heca))])

## Filter to only keep cells that have a value for the batch variable
(dat <- dat[, !is.na(dat[[batchvar]])])
class: SingleCellExperiment 
dim: 60649 253579 
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): DDX11L1 MIR1302-2HG ... MT-TE MT-TP
rowData names(5): gene_ids gene_name chromosome gene_type
  gene_type_ribo
colnames(253579): 4861STDY7387181_AGAGTGGGTCTAGCGC
  4861STDY7387181_TAAGAGATCTGAGTGT ...
  Huang_GSM7277298_CTATCCGTCTTTCGAT Huang_GSM7277298_GGGTTTACAGAACTTC
colData names(50): barcodes sample ... HECA_Lineage HECA_Label
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
## Sample annotation summary
DT::datatable(as.data.frame(colData(dat)) |>
                  dplyr::select(sample, SampleType, DonorID, SampleName, RunID,
                                ReadLength1, ReadLength2, Tissue, Stage,
                                Digestion, Protocol, Source, Sex, 
                                TimeInMenstrualCycle, DiseaseState, 
                                Fecundity, Phenotype, DevStage,
                                Shih_pheno, Shih_subjectID, 
                                StageFactor, StageCoarse, donorCol, sampleCol,
                                matchDataset) |>
                  dplyr::distinct(), 
              rownames = FALSE,
              options = list(scrollX = TRUE, pageLength = 50))

Check total count for each gene type and removed unwanted gene types

keepGeneTypes <- c("protein_coding", "IG_V_gene", "IG_C_gene", "IG_J_gene",
                   "TR_C_gene", "TR_J_gene", "TR_V_gene", "TR_D_gene",
                   "IG_D_gene", "Mt_tRNA", "Mt_rRNA")

pd <- data.frame(gene = rownames(dat), 
                 gene_type_ribo = rowData(dat)$gene_type_ribo, 
                 total_count = rowSums(counts(dat))) |>
    group_by(gene_type_ribo) |>
    summarize(nbr_genes = length(gene), 
              total_count = sum(total_count)) |>
    mutate(keep = gene_type_ribo %in% keepGeneTypes) |>
    arrange(desc(nbr_genes))
pd$gene_type_ribo <- factor(pd$gene_type_ribo, levels = unique(pd$gene_type_ribo))

p1 <- ggplot(pd, aes(x = gene_type_ribo, y = nbr_genes, fill = keep)) +
    geom_col() +
    labs(x = "", y = "Number of genes", fill = "keep") +
    theme_cowplot() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- ggplot(pd, aes(x = gene_type_ribo, y = total_count, fill = keep)) +
    geom_col() +
    labs(x = "", y = "Total UMI count", fill = "keep") +
    theme_cowplot() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot_grid(p1, p2, ncol = 1)

dat <- dat[rowData(dat)$gene_type_ribo %in% keepGeneTypes, ]
dim(dat)
[1]  20231 253579

Add QC metrics

(MT <- grep("^MT-", rownames(dat), value = TRUE))
 [1] "MT-TF"   "MT-RNR1" "MT-TV"   "MT-RNR2" "MT-TL1"  "MT-ND1"  "MT-TI"  
 [8] "MT-TM"   "MT-ND2"  "MT-TW"   "MT-CO1"  "MT-TD"   "MT-CO2"  "MT-TK"  
[15] "MT-ATP8" "MT-ATP6" "MT-CO3"  "MT-TG"   "MT-ND3"  "MT-TR"   "MT-ND4L"
[22] "MT-ND4"  "MT-TH"   "MT-TS2"  "MT-TL2"  "MT-ND5"  "MT-CYB"  "MT-TT"  
[29] "MT-TQ"   "MT-TA"   "MT-TN"   "MT-TC"   "MT-TY"   "MT-TS1"  "MT-ND6" 
[36] "MT-TE"   "MT-TP"  
dat <- scuttle::addPerCellQC(dat, assay.type = "counts", subsets = list(MT = MT))
dat <- scuttle::addPerFeatureQC(dat, assay.type = "counts")
dat$intronic_fraction <- 
    colSums(assay(dat, "unspliced")) / colSums(assay(dat, "total"))
rowData(dat)$intronic_fraction <- 
    rowSums(assay(dat, "unspliced")) / rowSums(assay(dat, "total"))

QC plots

total_count_threshold <- 1000
detected_threshold <- 500
mt_percent_threshold <- 25

dat$keep <- (dat$total > total_count_threshold) & 
    (dat$detected > detected_threshold) & 
    (dat$subsets_MT_percent < mt_percent_threshold)
table(dat$keep)

 FALSE   TRUE 
 79933 173646 
cdplot <- as.data.frame(colData(dat))

sid <- "sample"
stid <- "StageFactor"

ggplot(cdplot, aes(x = .data[[sid]], y = sum, fill = .data[[stid]])) +
    geom_hline(yintercept = total_count_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    labs(y = "Total UMI count per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = .data[[sid]], y = sum, fill = .data[[stid]])) +
    geom_hline(yintercept = total_count_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    scale_y_log10() + 
    labs(y = "Total UMI count per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = .data[[sid]], y = detected, fill = .data[[stid]])) +
    geom_hline(yintercept = detected_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    labs(y = "Number of detected genes per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = .data[[sid]], y = subsets_MT_percent, fill = .data[[stid]])) +
    geom_hline(yintercept = mt_percent_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    labs(y = "Percentage of MT count per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = detected, y = subsets_MT_percent, color = keep)) + 
    geom_point(size = 0.5, alpha = 0.5) + theme_minimal() + 
    labs(x = "Number of detected genes per cell", 
         y = "Percentage of MT count per cell")

ggplot(cdplot, aes(x = detected, y = intronic_fraction, color = keep)) + 
    geom_point(size = 0.5, alpha = 0.5) + theme_minimal() + 
    labs(x = "Number of detected genes per cell",
         y = "Intronic fraction per cell")

Filter and remove samples with too few cells

(tbl <- table(sample = dat$sample, keep = dat$keep))
                  keep
sample             FALSE  TRUE
  4861STDY7387181   1640  8886
  4861STDY7387182   1500  8571
  4861STDY7387183    738  8010
  4861STDY7771115    663  9339
  4861STDY7771123    431  6338
  GSM4577306        2025  1914
  GSM4577307        2594  2352
  GSM4577308        2704  9881
  GSM4577309       13724 19364
  GSM4577310        8149  8432
  GSM4577311        2749  2339
  GSM4577312        1643  5423
  GSM4577313        1433  3723
  GSM4577314        2378  3080
  GSM4577315        1979  4161
  Huang_GSM6605437  1779  6882
  Huang_GSM6605438  2182  6079
  Huang_GSM6605439   635 11971
  Huang_GSM6605440   800  5999
  Huang_GSM7277296  1071  5766
  Huang_GSM7277297  1640 11027
  Huang_GSM7277298   546  6813
  Lai_GSM5572238    5061  1901
  Lai_GSM5572239    3430  2166
  Lai_GSM5572240    2664  3277
  MRC_Endo8625698   2699  3331
  MRC_Endo8625699   4432  1628
  MRC_Endo8712024    970   450
  MRC_Endo8712032   1450   399
  MRC_Endo8715415    563   416
  MRC_Endo8715416   4784  1495
  Shih_MEtissue1     248   343
  Shih_MEtissue2     411   966
  Shih_MEtissue3     218   924
ggplot(as.data.frame(tbl), aes(x = sample, y = Freq, fill = keep)) + 
    geom_col(position = "stack") + theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(x = "", y = "Number of cells")

ggplot(as.data.frame(tbl), aes(x = sample, y = Freq, fill = keep)) + 
    geom_col(position = "fill") + theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(x = "", y = "Fraction of cells")

dat <- dat[, which(dat$keep)]
dim(dat)
[1]  20231 173646
(tbl <- sort(table(dat$sampleCol)))

      TB13021372       TB13021061  MRC_Endo8712032  MRC_Endo8715415 
             343              387              399              416 
 MRC_Endo8712024       TB13021576       TB13020851  MRC_Endo8715416 
             450              579              924             1495 
 MRC_Endo8625699   Lai_GSM5572238       GSM4577306   Lai_GSM5572239 
            1628             1901             1914             2166 
      GSM4577311       GSM4577307       GSM4577314   Lai_GSM5572240 
            2339             2352             3080             3277 
 MRC_Endo8625698       GSM4577313       GSM4577315       GSM4577312 
            3331             3723             4161             5423 
Huang_GSM7277296 Huang_GSM6605440 Huang_GSM6605438  4861STDY7771123 
            5766             5999             6079             6338 
Huang_GSM7277298 Huang_GSM6605437  4861STDY7387183       GSM4577310 
            6813             6882             8010             8432 
 4861STDY7387182  4861STDY7387181  4861STDY7771115       GSM4577308 
            8571             8886             9339             9881 
Huang_GSM7277297 Huang_GSM6605439       GSM4577309 
           11027            11971            19364 
keepsamples <- names(tbl)[tbl > 30]
dat <- dat[, dat$sampleCol %in% keepsamples]
dim(dat)
[1]  20231 173646
tbl <- table(dat[[sid]], dat[[stid]])
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
             cluster_columns = FALSE, cluster_rows = FALSE, 
             row_names_gp = gpar(fontsize = 8),
             name = "Number of\ncells per\nsample",
             rect_gp = gpar(col = "grey", lwd = 1),
             cell_fun = function(j, i, x, y, width, height, fill) {
                 grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
             }))

Recalculate QC metrics for genes based on remaining cells

dat <- scuttle::addPerFeatureQC(dat, assay.type = "counts")
rowData(dat)$intronic_fraction <- 
    rowSums(assay(dat, "unspliced")) / rowSums(assay(dat, "total"))

Most highly expressed genes

scater::plotHighestExprs(dat, n = 25, colour_cells_by = stid, 
                         exprs_values = "counts")

Calculate size factors and logcounts

size_factor_threshold <- 0.1
set.seed(123)
clusters <- scran::quickCluster(
    dat, method = "igraph", min.mean = 0.1,
    use.ranks = FALSE, BSPARAM = BiocSingular::ExactParam(),
    block = dat[[datasetvar]], 
    min.size = min(table(dat[[datasetvar]]))
)
dat <- scran::computeSumFactors(dat, min.mean = 0.1, cluster = clusters)
summary(sizeFactors(dat))
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.07845  0.36578  0.72227  1.00000  1.25364 22.57632 
ggplot(data.frame(libSize = dat$sum,
                  sizeFactor = sizeFactors(dat),
                  group = dat[[blockvar]],
                  stringsAsFactors = FALSE),
       aes(x = libSize, y = sizeFactor, color = group)) + 
    geom_point(alpha = 0.5) + theme_bw() + 
    geom_hline(yintercept = size_factor_threshold, linetype = "dashed") + 
    xlab("Total UMI count") + ylab("Size factor")

dat <- dat[, sizeFactors(dat) > size_factor_threshold]

dat <- batchelor::multiBatchNorm(dat, batch = dat[[batchvar]], min.mean = 0.1)
ggplot(data.frame(libSize = dat$sum,
                  sizeFactor = sizeFactors(dat),
                  group = dat[[blockvar]],
                  stringsAsFactors = FALSE),
       aes(x = libSize, y = sizeFactor, color = group)) + 
    geom_point(alpha = 0.5) + theme_bw() + 
    geom_hline(yintercept = size_factor_threshold, linetype = "dashed") + 
    xlab("Total UMI count") + ylab("Size factor")

Cell cycle assignment

## tricycle
set.seed(123)
dat <- project_cycle_space(dat, species = "human", gname.type = "SYMBOL")
dat <- estimate_cycle_position(dat)
dat <- estimate_Schwabe_stage(dat, gname.type = "SYMBOL", species = "human", 
                              batch.v = batchvar, tolerance = 0)
plotReducedDim(dat, "tricycleEmbedding", point_size = 0.5, 
               colour_by = "CCStage")

tbl <- table(dat$StageFactor, dat$CCStage)
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
             cluster_columns = FALSE, cluster_rows = FALSE, 
             row_names_gp = gpar(fontsize = 8),
             name = "Number of\ncells",
             rect_gp = gpar(col = "grey", lwd = 1),
             cell_fun = function(j, i, x, y, width, height, fill) {
                 grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
             }))

## Seurat (using genes from HECA)
s_genes <- read.delim(s_genes)[, 1]
g2m_genes <- read.delim(g2m_genes)[, 1]
seurat <- CreateSeuratObject(counts = assay(dat, "counts"))
seurat <- NormalizeData(seurat, verbose = FALSE)
seurat <- FindVariableFeatures(seurat, selection.method = "vst", verbose = FALSE)
seurat <- ScaleData(seurat, features = rownames(seurat), verbose = FALSE)
seurat <- CellCycleScoring(seurat, s.features = s_genes, g2m.features = g2m_genes, 
                           set.ident = TRUE)
stopifnot(all(rownames(seurat@meta.data) == colnames(dat)))
dat$SeuratCC <- seurat@meta.data$Phase
tbl <- table(dat$StageFactor, dat$SeuratCC)
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
             cluster_columns = FALSE, cluster_rows = FALSE, 
             row_names_gp = gpar(fontsize = 8),
             name = "Number of\ncells",
             rect_gp = gpar(col = "grey", lwd = 1),
             cell_fun = function(j, i, x, y, width, height, fill) {
                 grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
             }))

Find and mark doublets

set.seed(123)
dat <- scDblFinder(dat, samples = blockvar, verbose = FALSE, 
                   BPPARAM = MulticoreParam(workers = 8L))
table(dat$scDblFinder.class, dat[[blockvar]])
         
          4861STDY7387181 4861STDY7387182 4861STDY7387183 4861STDY7771115
  singlet            7861            7602            7306            8592
  doublet            1024             969             704             746
         
          4861STDY7771123 GSM4577306 GSM4577307 GSM4577308 GSM4577309
  singlet            5772       1825       2228       8607      16657
  doublet             558         89        124       1220       2694
         
          GSM4577310 GSM4577311 GSM4577312 GSM4577313 GSM4577314 GSM4577315
  singlet       7783       2209       4982       3449       2897       3864
  doublet        649        130        439        274        183        297
         
          Huang_GSM6605437 Huang_GSM6605438 Huang_GSM6605439 Huang_GSM6605440
  singlet             6313             5582            10858             5587
  doublet              569              497             1113              412
         
          Huang_GSM7277296 Huang_GSM7277297 Huang_GSM7277298 Lai_GSM5572238
  singlet             5263             9910             6217           1815
  doublet              503             1117              596             86
         
          Lai_GSM5572239 Lai_GSM5572240 MRC_Endo8625698 MRC_Endo8625699
  singlet           2054           3087            3118            1551
  doublet            112            190             213              77
         
          MRC_Endo8712024 MRC_Endo8712032 MRC_Endo8715415 MRC_Endo8715416
  singlet             431             385             404            1430
  doublet              19              14              12              65
         
          Shih_MEtissue1 Shih_MEtissue2 Shih_MEtissue3
  singlet            331            926            889
  doublet             12             40             35

Sketching

set.seed(123)
sketchcells.bybatch <- unlist(lapply(unique(colData(dat)[[batchvar]]), function(b) {
    subd <- dat[, which(colData(dat)[[batchvar]] == b)]
    geneVar <- scran::modelGeneVar(subd)
    subhvgs <- scran::getTopHVGs(geneVar, n = 2000)
    subd <- scater::runPCA(subd, exprs_values = "logcounts", ncomponents = 30, 
                           name = "PCAsub", 
                           subset_row = subhvgs, scale = FALSE)
    idxsub <- geosketch(reducedDim(subd, "PCAsub"), 
                        N = min(ncol(subd), max(0.1 * ncol(subd), 150)), 
                        seed = 123)
    colnames(subd)[idxsub]
}))
dat$sketched.bybatch <- ifelse(colnames(dat) %in% sketchcells.bybatch, TRUE, FALSE)
table(batch = colData(dat)[[batchvar]], sketched.bybatch = dat$sketched.bybatch)
            sketched.bybatch
batch        FALSE  TRUE
  A13        22920  2546
  A30        14102  1566
  E1          4464   495
  E2           699   150
  E3          1720   191
  GSM4577306  1723   191
  GSM4577307  2117   235
  GSM4577308  8845   982
  GSM4577309 17416  1935
  GSM4577310  7589   843
  GSM4577311  2106   233
  GSM4577312  4879   542
  GSM4577313  3351   372
  GSM4577314  2772   308
  GSM4577315  3745   416
  GSM5572238  1711   190
  GSM5572239  1950   216
  GSM5572240  2950   327
  GSM6605437  6194   688
  GSM6605438  5472   607
  GSM6605439 10774  1197
  GSM6605440  5400   599
  GSM7277296  5190   576
  GSM7277297  9925  1102
  GSM7277298  6132   681
  TB13020851   774   150
  TB13021061   237   150
  TB13021372   193   150
  TB13021576   429   150

Find an alternative reduced dimension representation, after sketching

set.seed(123)
## First find a new set of HVGs
dattmp <- dat[, sketchcells.bybatch]
stopifnot(all(rownames(dattmp) == rownames(dat)))
rowData(dat)$scran_geneVar_geosketch <- 
    scran::modelGeneVar(dattmp, block = dattmp[[datasetvar]])
hvgs <- scran::getTopHVGs(rowData(dat)$scran_geneVar_geosketch, n = 3000)
metadata(dat)$hvg.scran.genevar.geosketch <- hvgs

head(
    as.data.frame(
        rowData(dat)$scran_geneVar_geosketch[
            order(rowData(dat)$scran_geneVar_geosketch$bio, 
                  decreasing = TRUE), 
            c("mean", "total", "tech", "bio", "p.value", "FDR")]
    ), 
    25
)
             mean    total      tech       bio       p.value           FDR
GNLY    0.8430368 3.082955 0.4760610 2.6068937  0.000000e+00  0.000000e+00
PAEP    0.6338257 2.269092 0.3586959 1.9103957  0.000000e+00  0.000000e+00
CCL4    0.7156646 2.385446 0.4839192 1.9015270  0.000000e+00  0.000000e+00
HSPA1A  1.8405659 2.554906 0.9156427 1.6392630  4.227595e-45  5.262062e-43
COL3A1  1.3701836 2.108264 0.6047220 1.5035419  3.720120e-75  7.823796e-73
COL1A1  1.2971255 2.054165 0.5880678 1.4660975  1.002319e-76  2.157580e-74
CXCL8   0.8145901 1.844594 0.5457379 1.2988557 1.763084e-123 5.973916e-121
SFRP4   0.7736994 1.669994 0.3779656 1.2920287 4.521651e-107 1.378877e-104
IGFBP7  1.4500679 1.876015 0.7403303 1.1356848  1.664677e-39  1.760613e-37
HLA-DRA 0.5747616 1.530909 0.4216986 1.1092108 8.092654e-216 5.484122e-213
TIMP1   1.1100104 1.788087 0.6934214 1.0946661  5.409529e-27  4.158746e-25
CD74    0.7570072 1.623149 0.5305676 1.0925815 4.206788e-112 1.304604e-109
MT2A    1.0790865 1.703440 0.6173767 1.0860637  3.114396e-52  4.670828e-50
NKG7    0.5250787 1.402792 0.3760315 1.0267602 1.780916e-206 1.086181e-203
CCL3    0.4424003 1.354914 0.3306605 1.0242538 4.530567e-234 3.604165e-231
CCL4L2  0.3739051 1.276560 0.2596221 1.0169381 7.670023e-248 7.386232e-245
LGALS1  2.1602992 2.061316 1.0743581 0.9869577  4.468995e-18  2.412071e-16
WFDC2   0.7853112 1.323889 0.3450149 0.9788741 4.608429e-137 1.720825e-134
APOE    0.7601890 1.441097 0.4837123 0.9573851  1.263635e-53  1.959383e-51
SOD2    1.2907472 1.684880 0.7389731 0.9459069  9.240494e-38  9.289743e-36
TAGLN   0.6784393 1.290123 0.3467882 0.9433345  2.297792e-80  5.321861e-78
SRGN    0.9308689 1.528172 0.5987945 0.9293771  6.784450e-75  1.410626e-72
MT1G    0.5606329 1.151168 0.2301244 0.9210436 4.889999e-235 4.260587e-232
FOS     2.3218867 1.958179 1.0503591 0.9078202  9.190106e-12  3.495870e-10
CCL5    0.4146188 1.154212 0.2747909 0.8794207 4.360188e-192 2.346422e-189
## Next run multibatch PCA on the sketched cells
set.seed(123)
dattmp <- dat[, sketchcells.bybatch]
mbpca <- multiBatchPCA(dattmp, batch = dattmp[[batchvar]], d = 30, 
                       subset.row = metadata(dat)$hvg.scran.genevar.geosketch, 
                       weights = FALSE, preserve.single = TRUE)
projection <- mbpca[[1]]
rotation <- metadata(mbpca)$rotation
centers <- metadata(mbpca)$centers
## Sanity check
tmp <- t(sweep(assay(dattmp[names(centers), ], "logcounts"), MARGIN = 1, 
               STATS = centers, FUN = "-")) %*% rotation
max(abs(tmp - projection))
[1] 1.030287e-13
## Project all cells
pca <- t(sweep(assay(dat[names(centers), ], "logcounts"), MARGIN = 1, 
               STATS = centers, FUN = "-")) %*% rotation
Warning in asMethod(object): sparse->dense coercion: allocating vector of size
3.9 GiB
## Perform batch correction in the PCA space
stopifnot(all(rownames(pca) == colnames(dat)))
reducedDim(dat, "multibatchPCA") <- as.matrix(pca)
mnn <- reducedMNN(as.matrix(pca), batch = dat[[batchvar]], k = 20, 
                  BPPARAM = MulticoreParam(workers = 8L))
stopifnot(all(rownames(mnn) == colnames(dat)))
reducedDim(dat, "fastMNNcorr_sketch") <- mnn$corrected

Dimension reduction

Apply tSNE and UMAP to sketched cells and project the others.

## TSNE
set.seed(123)
tsne <- snifter::fitsne(
    reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"), 
    n_jobs = 8L, random_state = 123L,
    perplexity = 30, n_components = 2L, 
    pca = FALSE)
proj <- snifter::project(
    x = tsne, 
    new = reducedDim(dat, "fastMNNcorr_sketch"), 
    old = reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"))
rownames(proj) <- colnames(dat)
reducedDim(dat, "TSNE_fastMNNcorr_sketch") <- proj

## UMAP
set.seed(123)
umap <- uwot::umap(
    X = reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"), 
    n_components = 2, n_neighbors = 50,
    pca = NULL, scale = FALSE, 
    ret_model = TRUE, n_threads = 8L)
proj <- uwot::umap_transform(
    X = reducedDim(dat, "fastMNNcorr_sketch"), 
    model = umap, 
    n_threads = 8L)
rownames(proj) <- colnames(dat)
reducedDim(dat, "UMAP_fastMNNcorr_sketch") <- proj

Shuffle cells (to avoid plotting in layers by sample later)

set.seed(123)
dat <- dat[, sample(seq_len(ncol(dat)), ncol(dat), replace = FALSE)]

Annotate cells

dat$celltype_annot <- rep(NA, ncol(dat))

# Cells from the epithelial lineage are assigned a cell type label based on the 
# annotation from there. 
epiatlas <- readRDS(epiatlas)
epicells <- intersect(colnames(epiatlas), colnames(dat))
dat$celltype_annot[match(epicells, colnames(dat))] <- 
    as.character(epiatlas$leiden_res0.4_annot)[match(epicells, colnames(epiatlas))]

# For the other lineages, cells that appear in HECA
# are assigned the label from there. 
hecaidx <- which(!is.na(dat$HECA_Celltype) & dat$HECA_Lineage != "Epithelial")
dat$celltype_annot[hecaidx] <- dat$HECA_Celltype[hecaidx]

# Immune cells are assigned the refined immune cell type
dat$celltype_annot[which(dat$celltype_annot %in% 
                             c("Immune_Myeloid", "Immune_Lymphoid"))] <- NA
hecaimmuneidx <- which(!is.na(dat$HECA_CelltypeImmune))
dat$celltype_annot[hecaimmuneidx] <- dat$HECA_CelltypeImmune[hecaimmuneidx]

# Group together some subtypes
dat$celltype_annot[dat$celltype_annot %in% 
                       c("dStromal_early", "dStromal_late", "dStromal_mid")] <- "dStromal"
dat$celltype_annot[dat$celltype_annot %in% c("ePV_1a", "ePV_1b", "ePV_2")] <- "ePV"
dat$celltype_annot[dat$celltype_annot %in% 
                       c("eStromal", "eStromal_cycling", "eStromal_MMPs")] <- "eStromal"

table(dat$celltype_annot, useNA = "ifany")

                Arterial                   B_cell                     cDC1 
                     609                      130                       68 
                    cDC2                 ciliated                  cycling 
                     198                     3595                     6029 
               dHormones                 dStromal                      eM1 
                       1                    24519                      520 
                     eM2                      ePV                 eStromal 
                     589                     3348                    26928 
      Fibroblast_basalis hormone responsive early  hormone responsive late 
                      51                     4865                     3175 
                  HOXA13                     ILC3                    KRT5+ 
                      72                      288                       81 
           luminal early             luminal late                Lymphatic 
                    1274                    10564                       82 
               Mast_cell                 Monocyte                      mPV 
                      28                       73                      509 
                  MUC5B+                      pDC    Peripheral_lymphocyte 
                     152                      103                      288 
           Plasma_B_cell             pre-ciliated           Red_blood_cell 
                      56                      178                       30 
              remodeling          secretory early           secretory late 
                     728                     4296                     1265 
           secretory mid                sHormones               T_cell_CD4 
                   11287                      199                     1202 
              T_cell_CD8           T_cell_cycling                    T_Reg 
                    1475                      155                      126 
transcriptionally active                     uNK1             uNK1_cycling 
                    1400                      509                      333 
                    uNK2                     uNK3                    uSMCs 
                    1037                     1117                      998 
                  Venous                     <NA> 
                    5018                    54019 
# All remaining cells are assigned a label by running SingleR with the 
# previously annotated cells as the reference.
datleft <- dat[, is.na(dat$celltype_annot)]
datref <- dat[, !is.na(dat$celltype_annot)]

set.seed(123)
ref_summed <- aggregateAcrossCells(datref, id = datref$celltype_annot,
                                   use.assay.type = "logcounts",
                                   statistics = "mean")
pred <- SingleR(test = datleft, ref = ref_summed, 
                labels = ref_summed$celltype_annot)
plotScoreHeatmap(pred)

dat$celltype_annot[match(rownames(pred), colnames(dat))] <- pred$labels
table(dat$celltype_annot, useNA = "ifany")

                Arterial                   B_cell                     cDC1 
                    1367                      290                      104 
                    cDC2                 ciliated                  cycling 
                     282                     3604                     6101 
               dHormones                 dStromal                      eM1 
                      19                    33458                      684 
                     eM2                      ePV                 eStromal 
                     774                     5041                    40204 
      Fibroblast_basalis hormone responsive early  hormone responsive late 
                    1037                     5812                     3262 
                  HOXA13                     ILC3                    KRT5+ 
                     354                      323                      103 
           luminal early             luminal late                Lymphatic 
                    1311                    10860                      116 
               Mast_cell                 Monocyte                      mPV 
                      44                      305                     5167 
                  MUC5B+                      pDC    Peripheral_lymphocyte 
                     155                      132                      414 
           Plasma_B_cell             pre-ciliated           Red_blood_cell 
                      63                      186                       49 
              remodeling          secretory early           secretory late 
                     776                     5500                     1359 
           secretory mid                sHormones               T_cell_CD4 
                   11620                     6821                     1860 
              T_cell_CD8           T_cell_cycling                    T_Reg 
                    1812                      249                      186 
transcriptionally active                     uNK1             uNK1_cycling 
                    1565                      785                      531 
                    uNK2                     uNK3                    uSMCs 
                    1486                     1363                     5957 
                  Venous 
                   10076 
# Exclude cells that are not assigned a cluster label
table(is.na(dat$celltype_annot))

 FALSE 
173567 
dat <- dat[, !is.na(dat$celltype_annot)]

leiden_res0.4_annot_cols <- c(
    "pre-ciliated" = "#AEC7E8",
    "ciliated" = "#1F77B4",
    "secretory early" = "#FF9896",
    "secretory mid" = "#D62728",
    "secretory late" = "#C49C94",
    "luminal early" = "#C5B0D5",
    "luminal late" = "#9467BD",
    "transcriptionally active" = "#FF7F0E",
    "MUC5B+" = "#2CA02C",
    "KRT5+" = "#8C564B",
    "remodeling" = "grey20",
    "cycling" = "#FFBB78",
    "hormone responsive early" = "#F7B6D2",
    "hormone responsive late" = "#E377C2"
)

Dot plots of clusters and marker genes of interest

Marker genes from HECA paper (https://www.nature.com/articles/s41588-024-01873-w, Fig 2a).

col = circlize::colorRamp2(
    breaks = seq(0, 7.5, length.out = 64),
    colors = colorRampPalette(c("grey90", "purple"))(64))

markers <- intersect(
    c("PAX8", "EPCAM", "FOXA2", "MMP7", "SOX9", "AXIN2", "TRH", "ALDH1A1",
      "CDH2", "KLK11", "SLC7A11", "DKK1", "PHLDA1", "KMO", "IHH", "EMID1", 
      "HPRT1", "SUFU", "OPRK1", "CBR3", "S100P", "SCGB2A2", "ABCG1", "PAEP", 
      "DPP4", "GPX3", "PTPRR", "FGF7", "FXYD2", "IL32", "WNT7A", "LGR5", 
      "MMP7", "TNF", "VTCN1", "CLDN22", "SULT1E1", "LEFTY1", "PTGS1", "IL6",
      "CCNO", "CDC20B", "MUC12", "PIFO", "FOXJ1", "TP73", "BPIFB1", "TFF3", 
      "MUC5B", "SAA1", "TP63", "SNCG", "KRT5", "PGR", "AR", "ESR1"), 
    rownames(dat))
plotDots(dat, features = markers,
         group = "celltype_annot", 
         swap_rownames = "gene_name", zlim = c(0, 7.5),
         colour = col(seq(0, 7.5, length.out = 64))) + 
    scale_size(limits=c(0.2, 1)) + 
    labs(x = "") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Plot dimension reductions

plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
               other_fields = "HECA_Celltype", point_size = 0.5) + 
    geom_point(aes(color = HECA_Celltype), size = 0.5) + 
    guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

for (vrb in c("RCA_Celltype", "RCA_Location", "RCA_BiopsyType", 
              "matchDataset", "StageCoarse", "Stage", "donorCol", 
              "scDblFinder.class", "SeuratCC", "celltype_annot")) {
    print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                         colour_by = vrb, point_size = 0.5) + 
              guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))))
}

print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                     colour_by = "celltype_annot", point_size = 0.5) + 
          scale_color_manual(values = leiden_res0.4_annot_cols, name = "") + 
          guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))))
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

for (vrb in c("subsets_MT_percent", "detected", "intronic_fraction")) {
    print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                         colour_by = vrb, point_size = 0.5))
}

Plot cluster abundance over the cycle

col <- "celltype_annot"

df <- data.frame(colData(dat)[, c("StageFactor", col)]) |>
    group_by(StageFactor, .data[[col]]) |>
    tally() |>
    group_by(StageFactor) |>
    mutate(perc = n / sum(n) * 100) |>
    group_by(.data[[col]]) |>
    mutate(scaledperc = perc / sum(perc)) |>
    ungroup() |>
    dplyr::filter(!is.na(StageFactor))
ggplot(df, aes(x = StageFactor, y = perc, color = .data[[col]])) + 
    geom_point() + 
    geom_line(aes(x = as.numeric(StageFactor)), linewidth = 1) + 
    theme_cowplot() + 
    labs(x = "", y = "Relative abundance (%)",
         title = col) + 
    theme(legend.position = "bottom") + 
    scale_color_manual(values = leiden_res0.4_annot_cols, name = "")

## One panel per cluster
ggplot(df, aes(x = StageFactor, y = perc, color = .data[[col]])) + 
    geom_point() + 
    geom_line(aes(x = as.numeric(StageFactor)), linewidth = 1) + 
    theme_cowplot() + 
    labs(x = "", y = "Relative abundance (%)",
         title = col) + 
    facet_wrap(~ .data[[col]], scales = "free_y") + 
    scale_color_manual(values = leiden_res0.4_annot_cols, name = "") + 
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          strip.text = element_text(size = 10))

Find marker genes

First, we compare all clusters in a pairwise manner. The comparison is only performed for genes with a non-zero count in at least 0.05% of the cells (corresponding to 87 of the 173567 cells in the data set). These gene IDs are also written to a file, for potential use as the ‘universe’ in over-representation analyses.

frac_detected <- rowMeans(assay(dat, "counts") > 0)
genes_to_test <- rownames(dat)[frac_detected >= (0.05 / 100)]
write.table(genes_to_test, file = "epithelial_stromal_cell_atlas_gene_universe.txt", 
            row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")

clc <- "celltype_annot"
ptts <- pairwiseTTests(dat, groups = dat[[clc]], 
                       block = dat[[blockvar]], direction = "up", 
                       subset.row = genes_to_test)
markers <- combineMarkers(ptts$statistics, ptts$pairs, pval.type = "all")

## Write results to files
ttestdir <- paste0("pairwise_ttests_", clc)
if (!file.exists(ttestdir)) {
    dir.create(ttestdir, recursive = TRUE)
}
for (i in seq_len(nrow(ptts$pairs))) {
    df <- cbind(gene = rownames(ptts$statistics[[i]]), 
                ptts$statistics[[i]])
    df <- df[order(df$p.value, -df$logFC), ]
    filename <- paste0(clc, "_cluster_", ptts$pairs$first[i],
                       "_vs_", ptts$pairs$second[i], ".txt")
    write.table(df, file = file.path(ttestdir, filename), quote = FALSE,
                sep = "\t", row.names = FALSE, col.names = TRUE)
}

markerdir <- paste0("marker_genes_all_", clc)
if (!file.exists(markerdir)) {
    dir.create(markerdir, recursive = TRUE)
}
for (i in seq_along(markers)) {
    df <- cbind(gene = rownames(markers[[i]]), markers[[i]])
    filename <- paste0(clc, "_cluster_", names(markers)[i], "_markers.txt")
    write.table(df, file = file.path(markerdir, filename), quote = FALSE,
                sep = "\t", row.names = FALSE, col.names = TRUE)
}

Save object

saveRDS(dat, file = "invivo_cell_atlas.rds")

Session info

sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Rocky Linux 8.10 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Zurich
tzcode source: system (glibc)

attached base packages:
 [1] parallel  grid      stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] SingleR_2.8.0               DT_0.33                    
 [3] igraph_2.1.4                BiocParallel_1.40.2        
 [5] bluster_1.16.0              org.Hs.eg.db_3.20.0        
 [7] AnnotationDbi_1.68.0        tricycle_1.14.0            
 [9] sketchR_1.2.0               Seurat_5.2.1               
[11] SeuratObject_5.0.2          sp_2.2-0                   
[13] scDblFinder_1.20.2          circlize_0.4.16            
[15] ComplexHeatmap_2.22.0       cowplot_1.1.3              
[17] batchelor_1.22.0            msigdbr_10.0.2             
[19] tidyr_1.3.1                 dplyr_1.1.4                
[21] scran_1.34.0                scater_1.34.1              
[23] ggplot2_3.5.2               scuttle_1.16.0             
[25] SingleCellExperiment_1.28.1 SummarizedExperiment_1.36.0
[27] Biobase_2.66.0              GenomicRanges_1.58.0       
[29] GenomeInfoDb_1.42.3         IRanges_2.40.1             
[31] S4Vectors_0.44.0            BiocGenerics_0.52.0        
[33] MatrixGenerics_1.18.1       matrixStats_1.5.0          
[35] fishpond_2.12.0            

loaded via a namespace (and not attached):
  [1] spatstat.sparse_3.1-0     bitops_1.0-9             
  [3] httr_1.4.7                RColorBrewer_1.1-3       
  [5] doParallel_1.0.17         tools_4.4.2              
  [7] sctransform_0.4.1         R6_2.6.1                 
  [9] ResidualMatrix_1.16.0     lazyeval_0.2.2           
 [11] uwot_0.2.3                GetoptLong_1.0.5         
 [13] withr_3.0.2               gridExtra_2.3            
 [15] progressr_0.15.1          cli_3.6.4                
 [17] Cairo_1.6-2               spatstat.explore_3.4-2   
 [19] fastDummies_1.7.5         labeling_0.4.3           
 [21] sass_0.4.10               mvtnorm_1.3-3            
 [23] spatstat.data_3.1-6       readr_2.1.5              
 [25] ggridges_0.5.6            pbapply_1.7-2            
 [27] Rsamtools_2.22.0          dichromat_2.0-0.1        
 [29] parallelly_1.43.0         limma_3.62.2             
 [31] RSQLite_2.3.9             circular_0.5-1           
 [33] generics_0.1.3            shape_1.4.6.1            
 [35] BiocIO_1.16.0             crosstalk_1.2.1          
 [37] gtools_3.9.5              ica_1.0-3                
 [39] spatstat.random_3.3-3     Matrix_1.7-1             
 [41] ggbeeswarm_0.7.2          abind_1.4-8              
 [43] lifecycle_1.0.4           yaml_2.3.10              
 [45] edgeR_4.4.2               SparseArray_1.6.2        
 [47] Rtsne_0.17                blob_1.2.4               
 [49] promises_1.3.2            dqrng_0.4.1              
 [51] crayon_1.5.3              dir.expiry_1.14.0        
 [53] miniUI_0.1.2              lattice_0.22-6           
 [55] beachmat_2.22.0           KEGGREST_1.46.0          
 [57] magick_2.8.6              pillar_1.10.2            
 [59] knitr_1.50                metapod_1.14.0           
 [61] boot_1.3-31               rjson_0.2.23             
 [63] xgboost_1.7.9.1           future.apply_1.11.3      
 [65] codetools_0.2-20          glue_1.8.0               
 [67] spatstat.univar_3.1-2     data.table_1.17.0        
 [69] vctrs_0.6.5               png_0.1-8                
 [71] spam_2.11-1               gtable_0.3.6             
 [73] assertthat_0.2.1          cachem_1.1.0             
 [75] xfun_0.52                 S4Arrays_1.6.0           
 [77] mime_0.13                 survival_3.7-0           
 [79] pheatmap_1.0.12           iterators_1.0.14         
 [81] statmod_1.5.0             fitdistrplus_1.2-2       
 [83] ROCR_1.0-11               nlme_3.1-166             
 [85] bit64_4.6.0-1             filelock_1.0.3           
 [87] RcppAnnoy_0.0.22          bslib_0.9.0              
 [89] irlba_2.3.5.1             vipor_0.4.7              
 [91] KernSmooth_2.23-24        DBI_1.2.3                
 [93] colorspace_2.1-1          tidyselect_1.2.1         
 [95] bit_4.6.0                 compiler_4.4.2           
 [97] curl_6.2.2                BiocNeighbors_2.0.1      
 [99] basilisk.utils_1.18.0     DelayedArray_0.32.0      
[101] plotly_4.10.4             msigdbdf_24.1.0          
[103] rtracklayer_1.66.0        scales_1.3.0.9000        
[105] lmtest_0.9-40             stringr_1.5.1            
[107] digest_0.6.37             goftest_1.2-3            
[109] spatstat.utils_3.1-3      rmarkdown_2.29           
[111] basilisk_1.18.0           XVector_0.46.0           
[113] htmltools_0.5.8.1         pkgconfig_2.0.3          
[115] sparseMatrixStats_1.18.0  fastmap_1.2.0            
[117] rlang_1.1.6               GlobalOptions_0.1.2      
[119] htmlwidgets_1.6.4         UCSC.utils_1.2.0         
[121] shiny_1.10.0              DelayedMatrixStats_1.28.1
[123] jquerylib_0.1.4           farver_2.1.2             
[125] zoo_1.8-14                jsonlite_2.0.0           
[127] BiocSingular_1.22.0       RCurl_1.98-1.17          
[129] magrittr_2.0.3            GenomeInfoDbData_1.2.13  
[131] dotCall64_1.2             patchwork_1.3.0          
[133] Rcpp_1.0.14               ggnewscale_0.5.1         
[135] babelgene_22.9            viridis_0.6.5            
[137] reticulate_1.42.0         stringi_1.8.7            
[139] zlibbioc_1.52.0           MASS_7.3-61              
[141] plyr_1.8.9                listenv_0.9.1            
[143] ggrepel_0.9.6             deldir_2.0-4             
[145] Biostrings_2.74.1         splines_4.4.2            
[147] tensor_1.5                hms_1.1.3                
[149] locfit_1.5-9.12           spatstat.geom_3.3-6      
[151] RcppHNSW_0.6.0            reshape2_1.4.4           
[153] ScaledMatrix_1.14.0       XML_3.99-0.18            
[155] evaluate_1.0.3            snifter_1.16.0           
[157] tzdb_0.5.0                foreach_1.5.2            
[159] httpuv_1.6.16             RANN_2.6.2               
[161] purrr_1.0.4               polyclip_1.10-7          
[163] future_1.40.0             clue_0.3-66              
[165] scattermore_1.2           rsvd_1.0.5               
[167] xtable_1.8-4              restfulr_0.0.15          
[169] svMisc_1.4.3              RSpectra_0.16-2          
[171] later_1.4.2               viridisLite_0.4.2        
[173] tibble_3.2.1              memoise_2.0.1            
[175] beeswarm_0.4.0            GenomicAlignments_1.42.0 
[177] cluster_2.1.6             globals_0.17.0