Generating an epithelial endometrial atlas

Published

March 3, 2025

Load packages

suppressPackageStartupMessages({
    library(fishpond)
    library(SummarizedExperiment)
    library(SingleCellExperiment)
    library(scuttle)
    library(scater)
    library(scran)
    library(dplyr)
    library(tidyr)
    library(ggplot2)
    library(msigdbr)
    library(batchelor)
    library(cowplot)
    library(ComplexHeatmap)
    library(circlize)
    library(scDblFinder)
    library(Seurat)
    library(sketchR)
    library(tricycle)
    library(org.Hs.eg.db)
    library(parallel)
    library(bluster)
    library(BiocParallel)
    library(igraph)
    library(DT)
})

Define parameters and reference files

samples <- c("4861STDY7387181", "4861STDY7387182", "4861STDY7387183", 
             "4861STDY7771115", "4861STDY7771123", "MRC_Endo8625698", 
             "MRC_Endo8625699", "MRC_Endo8712024", "MRC_Endo8712032", 
             "MRC_Endo8715415", "MRC_Endo8715416", "GSM4577306", "GSM4577307", 
             "GSM4577308", "GSM4577309", "GSM4577310", "GSM4577311", 
             "GSM4577312", "GSM4577313", "GSM4577314", "GSM4577315", 
             "Shih_MEtissue1", "Shih_MEtissue2", "Shih_MEtissue3", 
             "Shih_MEtissue4", "Shih_MEtissue5", "Shih_MEtissue6", 
             "Shih_MEtissue7", "Huang_GSM6605437", "Huang_GSM6605438", 
             "Huang_GSM6605440", "Huang_GSM6605439", "Huang_GSM7277296", 
             "Huang_GSM7277297", "Huang_GSM7277298", "Lai_GSM5572238", 
             "Lai_GSM5572239", "Lai_GSM5572240")
blockvar <- "sample"           ## used as blocking factor in doublet annotation
batchvar <- "donorCol"         ## used for batch correction
datasetvar <- "matchDataset"   ## used as blocking factor in HVG annotation

## Sample annotation
sampleannot <- "invivo_atlas_samples.txt"

## Gene annotation (derived from GENCODE v38 gtf file)
t2g <- "reference/gencode.v38.annotation.tx2gene_full.rds"

## Reproductive cell atlas data (downloaded from 
## https://www.reproductivecellatlas.org/ and converted to SingleCellExperiment 
## format)
rca <- "data/reproductivecellatlas/2021/endometrium_all.rds"
heca <- "data/reproductivecellatlas/2023/endometriumAtlasV2_cells_with_counts.rds"

## Cell cycle marker genes
## https://github.com/ventolab/HECA-Human-Endometrial-Cell-Atlas/tree/main/utils
s_genes <- "data/reproductivecellatlas/2023/S_genes_HECA.tsv"
g2m_genes <- "data/reproductivecellatlas/2023/G2M_genes_HECA.tsv"

## Shih et al cell annotations (obtained from 
## https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE203191)
shih_cellannot <- "data/GSE203191/GSE203191_Shih_endo_meta_frame.tsv"

Assemble gene annotation

geneannot <- readRDS(t2g) |>
    dplyr::select(gene_id, seqnames, gene_type, gene_name) |>
    dplyr::rename(chromosome = seqnames) |>
    dplyr::distinct()

## Annotate ribosomal protein genes
## GO term "structural constituent of ribosome" (GO:0003735)
golist <- as.list(org.Hs.egGO2ALLEGS)
ribo_entrez <- unique(golist[["GO:0003735"]])
length(ribo_entrez)
[1] 175
tmp <- select(org.Hs.eg.db, keys = ribo_entrez, columns = "ENSEMBL", 
              keytype = "ENTREZID")
'select()' returned 1:many mapping between keys and columns
ribo_ensembl <- unique(na.omit(tmp[, "ENSEMBL"]))
length(ribo_ensembl)
[1] 195
summary(ribo_ensembl %in% sub("\\.[0-9]+$", "", geneannot$gene_id))
   Mode   FALSE    TRUE 
logical      33     162 
geneannot$is_ribosomal_protein <- 
    sub("\\.[0-9]+$", "", geneannot$gene_id) %in% ribo_ensembl
geneannot$gene_type_ribo <- ifelse(
    geneannot$gene_type == "protein_coding",
    ifelse(geneannot$is_ribosomal_protein, "protein_coding_ribosome",
           geneannot$gene_type),
    geneannot$gene_type)

Read data

## Get sample IDs
tbl <- read.delim(sampleannot) |>
     dplyr::filter(SampleName %in% samples)
DT::datatable(tbl, options = list(scrollX = TRUE, pageLength = 50))
sns <- unique(tbl$SampleName)
names(sns) <- sns

## Read alevin-fry data and combine
dat <- do.call(cbind, lapply(sns, function(s) {
    x <- fishpond::loadFry(
        fryDir = file.path("data", "alevin_fry_splici", s), 
        outputFormat = list(counts = c("S", "A"), spliced = c("S", "A"),
                            unspliced = c("U"), total = c("U", "S", "A")))
    rowData(x)$gene_name <- 
        geneannot$gene_name[match(rowData(x)$gene_ids, geneannot$gene_id)]
    rowData(x)$chromosome <- 
        geneannot$chromosome[match(rowData(x)$gene_ids, geneannot$gene_id)]
    rowData(x)$gene_type <- 
        geneannot$gene_type[match(rowData(x)$gene_ids, geneannot$gene_id)]
    rowData(x)$gene_type_ribo <- 
        geneannot$gene_type_ribo[match(rowData(x)$gene_ids, geneannot$gene_id)]
    
    ## Keep only cells with at least 1,000 detected genes
    x <- x[, colSums(assay(x, "total") > 0) > 1000]
    
    ## Add sample ID to colnames
    x$sample <- s
    colnames(x) <- paste0(s, "_", colnames(x))
    x
}))
dat
class: SingleCellExperiment 
dim: 60649 274348 
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): ENSG00000223972.5 ENSG00000243485.5 ...
  ENSG00000210194.1 ENSG00000210196.2
rowData names(5): gene_ids gene_name chromosome gene_type
  gene_type_ribo
colnames(274348): 4861STDY7387181_AGAGTGGGTCTAGCGC
  4861STDY7387181_TAAGAGATCTGAGTGT ...
  Huang_GSM7277298_CTATCCGTCTTTCGAT Huang_GSM7277298_GGGTTTACAGAACTTC
colData names(2): barcodes sample
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
## Set rownames to gene names where possible
rownames(dat) <- scater::uniquifyFeatureNames(
    ID = rownames(dat), 
    names = rowData(dat)$gene_name
)

Add sample annotations

## Extract barcodes corresponding to control samples in Shih data and remove 
## all other barcodes from the Shih samples
cb <- read.delim(shih_cellannot) |>
    dplyr::filter(pheno == "Control") |>
    dplyr::mutate(barcode = gsub("-[0-9]$", "", barcode)) |>
    dplyr::mutate(sample_cb = paste("Shih", run, barcode, sep = "_")) |>
    dplyr::pull(sample_cb)
to_remove <- which(grepl("Shih", dat$sample) & 
                       !paste0(dat$sample, "_", dat$barcodes) %in% cb)
dat <- dat[, -to_remove]
dim(dat)
[1]  60649 253579
## Add 'cell' column
dat$cell <- colnames(dat)

## Any other provided annotations
tbl2 <- tbl |>
    dplyr::filter(RunID != "Shih")
if (nrow(tbl2) > 0 && (length(tbl2$SampleName) == length(unique(tbl2$SampleName)))) {
    for (cn in colnames(tbl2)) {
        if (!all(is.na(tbl2[[cn]])) && !(cn %in% colnames(colData(dat)))) {
            if (cn == "Day") {
                colData(dat)[[cn]] <- paste0(
                    "D", tbl2[[cn]][match(dat$sample, tbl2$SampleName)])
            } else if (cn == "RunID") {
                colData(dat)[[cn]] <- sub("_[0-9]+$", "", 
                                          tbl2[[cn]][match(dat$sample, 
                                                           tbl2$SampleName)])
            } else {
                colData(dat)[[cn]] <- tbl2[[cn]][match(dat$sample, tbl2$SampleName)]
            }
        }
    }
}

## Harmonize annotation values
dat$Stage <- gsub(" stage", "", dat$Stage)

## Add information for the Shih data set
shih_annot <- read.delim(shih_cellannot) |>
    dplyr::mutate(cell = paste0("Shih_", run, "_", gsub("-[0-9]$", "", barcode)))
if (any(dat$cell %in% shih_annot$cell)) {
    for (cn in c("clusterID", "pheno", "Phase", "subjectID", "NKsubclusterID",
                 "StromalSubclusterID")) {
        colData(dat)[[paste0("Shih_", cn)]] <- 
            shih_annot[[cn]][match(dat$cell, shih_annot$cell)]
    }
}
dat$RunID[grep("Shih", dat$sample)] <- "Shih"
dat$Stage[grep("Shih", dat$sample)] <- "menstrual"

dat$StageFactor <- factor(dat$Stage, 
                          levels = c("menstrual", "proliferative",
                                     "early-secretory", "early-mid-secretory",
                                     "mid-secretory", "late-secretory"))
dat$StageCoarse <- sub("early-|early-mid-|mid-|late-", "", dat$Stage)

## Create a 'donorCol' column
dat$donorCol <- NA_character_
if ("DonorID" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["DonorID"]]))
    dat$donorCol[idx] <- dat[["DonorID"]][idx]
}
if ("Shih_subjectID" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["Shih_subjectID"]]))
    dat$donorCol[idx] <- dat[["Shih_subjectID"]][idx]
}

## Create a 'sampleCol' column (either sample or Shih_subjectID)
dat$sampleCol <- NA_character_
if ("sample" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["sample"]]))
    dat$sampleCol[idx] <- dat[["sample"]][idx]
}
if ("Shih_subjectID" %in% colnames(colData(dat))) {
    idx <- which(!is.na(dat[["Shih_subjectID"]]))
    dat$sampleCol[idx] <- dat[["Shih_subjectID"]][idx]
}

## Generate columns to match to HECA
dat$matchDataset <- ifelse(dat$RunID %in% c("Wang", "Shih", "Lai", "Huang"), 
                           dat$RunID, "GarciaAlonso")
dat$matchCell <- sub("^Huang_|^Lai_|^Shih_", "", dat$cell)
dat$matchId <- paste0(dat$matchCell, "-", dat$matchDataset)

## Add annotations from the reproductive cell atlas (2021)
rca <- readRDS(rca)
table(dat$matchDataset, colnames(dat) %in% colnames(rca), useNA = "ifany")
              
               FALSE  TRUE
  GarciaAlonso 28396 40337
  Huang        63190     0
  Lai          18499     0
  Shih          3110     0
  Wang         47223 52824
dat$RCA_CellCyclePhase <- rca$`CellCycle Phase`[match(colnames(dat), colnames(rca))]
dat$RCA_Celltype <- rca$`Cell type`[match(colnames(dat), colnames(rca))]
dat$RCA_percentmito <- rca$percent_mito[match(colnames(dat), colnames(rca))]
dat$RCA_ngenes <- rca$n_genes[match(colnames(dat), colnames(rca))]
dat$RCA_BiopsyType <- rca$BiopsyType[match(colnames(dat), colnames(rca))]
dat$RCA_Location <- rca$Location[match(colnames(dat), colnames(rca))]
dat$RCA_BroadCelltype <- rca$`Broad cell type`[match(colnames(dat), colnames(rca))]

## Add annotations from HECA (2023)
heca <- readRDS(heca)
table(dat$matchDataset, dat$matchId %in% colnames(heca), useNA = "ifany")
              
               FALSE  TRUE
  GarciaAlonso 37041 31692
  Huang        34420 28770
  Lai           6227 12272
  Shih          3110     0
  Wang         46329 53718
dat$HECA_CellCyclePhase <- as.character(heca$phase[match(dat$matchId, colnames(heca))])
dat$HECA_Celltype <- as.character(heca$celltype[match(dat$matchId, colnames(heca))])
dat$HECA_ngenes <- heca$n_genes[match(dat$matchId, colnames(heca))]
dat$HECA_percentmito <- heca$percent_mito[match(dat$matchId, colnames(heca))]
dat$HECA_ScrubletScore <- heca$scrublet_score[match(dat$matchId, colnames(heca))]
dat$HECA_Stage <- as.character(heca$Stage[match(dat$matchId, colnames(heca))])
dat$HECA_Lineage <- as.character(heca$lineage[match(dat$matchId, colnames(heca))])
dat$HECA_Label <- as.character(heca$label_long[match(dat$matchId, colnames(heca))])

## Filter to only keep cells that have a value for the batch variable
(dat <- dat[, !is.na(dat[[batchvar]])])
class: SingleCellExperiment 
dim: 60649 253579 
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): DDX11L1 MIR1302-2HG ... MT-TE MT-TP
rowData names(5): gene_ids gene_name chromosome gene_type
  gene_type_ribo
colnames(253579): 4861STDY7387181_AGAGTGGGTCTAGCGC
  4861STDY7387181_TAAGAGATCTGAGTGT ...
  Huang_GSM7277298_CTATCCGTCTTTCGAT Huang_GSM7277298_GGGTTTACAGAACTTC
colData names(49): barcodes sample ... HECA_Lineage HECA_Label
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
## Sample annotation summary
DT::datatable(as.data.frame(colData(dat)) |>
                  dplyr::select(sample, SampleType, DonorID, SampleName, RunID,
                                ReadLength1, ReadLength2, Tissue, Stage,
                                Digestion, Protocol, Source, Sex, 
                                TimeInMenstrualCycle, DiseaseState, 
                                Fecundity, Phenotype, DevStage,
                                Shih_pheno, Shih_subjectID, 
                                StageFactor, StageCoarse, donorCol, sampleCol,
                                matchDataset) |>
                  dplyr::distinct(), 
              rownames = FALSE,
              options = list(scrollX = TRUE, pageLength = 50))

Extract only epithelial cells for further analysis

We perform a basic pre-analysis with the aim to cluster the cells and extract the ones that are likely of epithelial origin, for further processing.

set.seed(123)

## Normalize
dat0 <- scuttle::computeLibraryFactors(dat)
dat0 <- batchelor::multiBatchNorm(dat0, batch = dat0[[batchvar]], min.mean = 0.1)

## Find HVGs
scran_geneVar <- scran::modelGeneVar(dat0, block = dat0[[datasetvar]])
hvg.scran.genevar <- scran::getTopHVGs(scran_geneVar, n = 3000)

## Batch correction and dimension reduction
mnn.out <- fastMNN(dat0, batch = dat0[[batchvar]], d = 50, k = 20, 
                   subset.row = hvg.scran.genevar)
stopifnot(all(colnames(mnn.out) == colnames(dat0)))
reducedDim(dat0, "fastMNNcorr") <- reducedDim(mnn.out, "corrected")

dat0 <- scater::runUMAP(dat0, dimred = "fastMNNcorr", 
                        name = "UMAP_fastMNNcorr", n_neighbors = 50)
plotReducedDim(dat0, dimred = "UMAP_fastMNNcorr", colour_by = "HECA_Lineage", 
               point_size = 0.5) + 
    guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

## Cluster the UMAP and select the epithelial cluster
grph <- bluster::makeSNNGraph(reducedDim(dat0, "UMAP_fastMNNcorr"), 
                              k = 20, type = "rank", 
                              BPPARAM = MulticoreParam(workers = 8L))
clst <- igraph::cluster_louvain(grph, resolution = 0.001)
dat0$louvain <- as.character(clst$membership)
plotReducedDim(dat0, dimred = "UMAP_fastMNNcorr", colour_by = "louvain", 
               point_size = 0.5) + 
    guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

tbl <- table(dat0$louvain, dat0$HECA_Lineage)
tbl <- sweep(tbl, MARGIN = 1, STATS = rowSums(tbl), FUN = "/")
tbl
   
     Endothelial   Epithelial       Immune  Mesenchymal
  1 9.593016e-05 9.988488e-01 9.593016e-04 9.593016e-05
  2 4.684280e-05 3.278996e-04 2.451440e-03 9.971738e-01
  3 9.988607e-01 4.882812e-04 1.627604e-04 4.882812e-04
  4 0.000000e+00 0.000000e+00 3.020236e-04 9.996980e-01
  5 0.000000e+00 0.000000e+00 5.773672e-04 9.994226e-01
  6 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
  7 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
## epithelial
(keepclust_epithelial <- rownames(tbl)[tbl[, "Epithelial"] > 0.9])
[1] "1"
dat0$keep_epithelial <- dat0$louvain %in% keepclust_epithelial
table(dat0$keep_epithelial)

 FALSE   TRUE 
168225  85354 
scater::plotReducedDim(dat0, dimred = "UMAP_fastMNNcorr", 
                       colour_by = "keep_epithelial", point_size = 0.5) + 
    guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

## Subset the original data set to the epithelial cells, and delete the 
## temporary dataset used above
dat0 <- dat0[, which(dat0$keep_epithelial)]
dim(dat0)
[1] 60649 85354
stopifnot(colnames(dat0) %in% colnames(dat))
dat <- dat[, colnames(dat0)]
rm(dat0)
dat
class: SingleCellExperiment 
dim: 60649 85354 
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): DDX11L1 MIR1302-2HG ... MT-TE MT-TP
rowData names(5): gene_ids gene_name chromosome gene_type
  gene_type_ribo
colnames(85354): 4861STDY7387181_AGAGTGGGTCTAGCGC
  4861STDY7387181_CATCAAGTCCTGCTTG ...
  Huang_GSM7277298_ATTTACCGTGGTTTGT Huang_GSM7277298_AATGGCTAGACACACG
colData names(49): barcodes sample ... HECA_Lineage HECA_Label
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

Check total count for each gene type and removed unwanted gene types

keepGeneTypes <- c("protein_coding", "IG_V_gene", "IG_C_gene", "IG_J_gene",
                   "TR_C_gene", "TR_J_gene", "TR_V_gene", "TR_D_gene",
                   "IG_D_gene", "Mt_tRNA", "Mt_rRNA")

pd <- data.frame(gene = rownames(dat), 
                 gene_type_ribo = rowData(dat)$gene_type_ribo, 
                 total_count = rowSums(counts(dat))) |>
    group_by(gene_type_ribo) |>
    summarize(nbr_genes = length(gene), 
              total_count = sum(total_count)) |>
    mutate(keep = gene_type_ribo %in% keepGeneTypes) |>
    arrange(desc(nbr_genes))
pd$gene_type_ribo <- factor(pd$gene_type_ribo, levels = unique(pd$gene_type_ribo))

p1 <- ggplot(pd, aes(x = gene_type_ribo, y = nbr_genes, fill = keep)) +
    geom_col() +
    labs(x = "", y = "Number of genes", fill = "keep") +
    theme_cowplot() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- ggplot(pd, aes(x = gene_type_ribo, y = total_count, fill = keep)) +
    geom_col() +
    labs(x = "", y = "Total UMI count", fill = "keep") +
    theme_cowplot() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot_grid(p1, p2, ncol = 1)

dat <- dat[rowData(dat)$gene_type_ribo %in% keepGeneTypes, ]
dim(dat)
[1] 20231 85354

Add QC metrics

(MT <- grep("^MT-", rownames(dat), value = TRUE))
 [1] "MT-TF"   "MT-RNR1" "MT-TV"   "MT-RNR2" "MT-TL1"  "MT-ND1"  "MT-TI"  
 [8] "MT-TM"   "MT-ND2"  "MT-TW"   "MT-CO1"  "MT-TD"   "MT-CO2"  "MT-TK"  
[15] "MT-ATP8" "MT-ATP6" "MT-CO3"  "MT-TG"   "MT-ND3"  "MT-TR"   "MT-ND4L"
[22] "MT-ND4"  "MT-TH"   "MT-TS2"  "MT-TL2"  "MT-ND5"  "MT-CYB"  "MT-TT"  
[29] "MT-TQ"   "MT-TA"   "MT-TN"   "MT-TC"   "MT-TY"   "MT-TS1"  "MT-ND6" 
[36] "MT-TE"   "MT-TP"  
dat <- scuttle::addPerCellQC(dat, assay.type = "counts", subsets = list(MT = MT))
dat <- scuttle::addPerFeatureQC(dat, assay.type = "counts")
dat$intronic_fraction <- 
    colSums(assay(dat, "unspliced")) / colSums(assay(dat, "total"))
rowData(dat)$intronic_fraction <- 
    rowSums(assay(dat, "unspliced")) / rowSums(assay(dat, "total"))

QC plots

total_count_threshold <- 1000
detected_threshold <- 500
mt_percent_threshold <- 25

dat$keep <- (dat$total > total_count_threshold) & 
    (dat$detected > detected_threshold) & 
    (dat$subsets_MT_percent < mt_percent_threshold)
table(dat$keep)

FALSE  TRUE 
35765 49589 
cdplot <- as.data.frame(colData(dat))

sid <- "sample"
stid <- "StageFactor"

ggplot(cdplot, aes(x = .data[[sid]], y = sum, fill = .data[[stid]])) +
    geom_hline(yintercept = total_count_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    labs(y = "Total UMI count per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = .data[[sid]], y = sum, fill = .data[[stid]])) +
    geom_hline(yintercept = total_count_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    scale_y_log10() + 
    labs(y = "Total UMI count per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = .data[[sid]], y = detected, fill = .data[[stid]])) +
    geom_hline(yintercept = detected_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    labs(y = "Number of detected genes per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = .data[[sid]], y = subsets_MT_percent, fill = .data[[stid]])) +
    geom_hline(yintercept = mt_percent_threshold) + 
    geom_violin(scale = "width") + theme_minimal() +
    labs(y = "Percentage of MT count per cell") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggplot(cdplot, aes(x = detected, y = subsets_MT_percent, color = keep)) + 
    geom_point(size = 0.5, alpha = 0.5) + theme_minimal() + 
    labs(x = "Number of detected genes per cell", 
         y = "Percentage of MT count per cell")

ggplot(cdplot, aes(x = detected, y = intronic_fraction, color = keep)) + 
    geom_point(size = 0.5, alpha = 0.5) + theme_minimal() + 
    labs(x = "Number of detected genes per cell",
         y = "Intronic fraction per cell")

Filter and remove samples with too few cells

(tbl <- table(sample = dat$sample, keep = dat$keep))
                  keep
sample             FALSE  TRUE
  4861STDY7387181    474   415
  4861STDY7387182    454   393
  4861STDY7387183     10     3
  4861STDY7771115     44  3946
  4861STDY7771123     12   822
  GSM4577306        1506   791
  GSM4577307        2427  2142
  GSM4577308        1947  4972
  GSM4577309       12015 14387
  GSM4577310        2805  2512
  GSM4577311        2717  2171
  GSM4577312        1099  2595
  GSM4577313          85   172
  GSM4577314        1749  1219
  GSM4577315        1352  1812
  Huang_GSM6605437   976  2730
  Huang_GSM6605438   442   710
  Huang_GSM6605439   304  1604
  Huang_GSM6605440   408  1434
  Huang_GSM7277296   439  1876
  Huang_GSM7277297   225   657
  Huang_GSM7277298   137  1127
  Lai_GSM5572238       5    19
  Lai_GSM5572239      79     6
  Lai_GSM5572240     167    13
  MRC_Endo8625698    114     3
  MRC_Endo8625699    936    90
  MRC_Endo8712024    133    26
  MRC_Endo8712032    215     2
  MRC_Endo8715415    123   119
  MRC_Endo8715416   2252   643
  Shih_MEtissue1      41    53
  Shih_MEtissue2       5    13
  Shih_MEtissue3      68   112
ggplot(as.data.frame(tbl), aes(x = sample, y = Freq, fill = keep)) + 
    geom_col(position = "stack") + theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(x = "", y = "Number of cells")

ggplot(as.data.frame(tbl), aes(x = sample, y = Freq, fill = keep)) + 
    geom_col(position = "fill") + theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(x = "", y = "Fraction of cells")

dat <- dat[, which(dat$keep)]
dim(dat)
[1] 20231 49589
(tbl <- sort(table(dat$sampleCol)))

 MRC_Endo8712032  4861STDY7387183  MRC_Endo8625698   Lai_GSM5572239 
               2                3                3                6 
  Lai_GSM5572240       TB13021061   Lai_GSM5572238  MRC_Endo8712024 
              13               13               19               26 
      TB13021372  MRC_Endo8625699       TB13020851  MRC_Endo8715415 
              53               90              112              119 
      GSM4577313  4861STDY7387182  4861STDY7387181  MRC_Endo8715416 
             172              393              415              643 
Huang_GSM7277297 Huang_GSM6605438       GSM4577306  4861STDY7771123 
             657              710              791              822 
Huang_GSM7277298       GSM4577314 Huang_GSM6605440 Huang_GSM6605439 
            1127             1219             1434             1604 
      GSM4577315 Huang_GSM7277296       GSM4577307       GSM4577311 
            1812             1876             2142             2171 
      GSM4577310       GSM4577312 Huang_GSM6605437  4861STDY7771115 
            2512             2595             2730             3946 
      GSM4577308       GSM4577309 
            4972            14387 
keepsamples <- names(tbl)[tbl > 30]
dat <- dat[, dat$sampleCol %in% keepsamples]
dim(dat)
[1] 20231 49504
tbl <- table(dat[[sid]], dat[[stid]])
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
             cluster_columns = FALSE, cluster_rows = FALSE, 
             row_names_gp = gpar(fontsize = 8),
             name = "Number of\ncells per\nsample",
             rect_gp = gpar(col = "grey", lwd = 1),
             cell_fun = function(j, i, x, y, width, height, fill) {
                 grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
             }))

Recalculate QC metrics for genes based on remaining cells

dat <- scuttle::addPerFeatureQC(dat, assay.type = "counts")
rowData(dat)$intronic_fraction <- 
    rowSums(assay(dat, "unspliced")) / rowSums(assay(dat, "total"))

Most highly expressed genes

scater::plotHighestExprs(dat, n = 25, colour_cells_by = stid, 
                         exprs_values = "counts")

Calculate size factors and logcounts

size_factor_threshold <- 0.1
set.seed(123)
clusters <- scran::quickCluster(
    dat, method = "igraph", min.mean = 0.1,
    use.ranks = FALSE, BSPARAM = BiocSingular::ExactParam(),
    block = dat[[datasetvar]], 
    min.size = min(table(dat[[datasetvar]]))
)
dat <- scran::computeSumFactors(dat, min.mean = 0.1, cluster = clusters)
summary(sizeFactors(dat))
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.06493  0.28916  0.59723  1.00000  1.34411 15.57060 
ggplot(data.frame(libSize = dat$sum,
                  sizeFactor = sizeFactors(dat),
                  group = dat[[blockvar]],
                  stringsAsFactors = FALSE),
       aes(x = libSize, y = sizeFactor, color = group)) + 
    geom_point(alpha = 0.5) + theme_bw() + 
    geom_hline(yintercept = size_factor_threshold, linetype = "dashed") + 
    xlab("Total UMI count") + ylab("Size factor")

dat <- dat[, sizeFactors(dat) > size_factor_threshold]
dim(dat)
[1] 20231 48922
dat <- batchelor::multiBatchNorm(dat, batch = dat[[batchvar]], min.mean = 0.1)
ggplot(data.frame(libSize = dat$sum,
                  sizeFactor = sizeFactors(dat),
                  group = dat[[blockvar]],
                  stringsAsFactors = FALSE),
       aes(x = libSize, y = sizeFactor, color = group)) + 
    geom_point(alpha = 0.5) + theme_bw() + 
    geom_hline(yintercept = size_factor_threshold, linetype = "dashed") + 
    xlab("Total UMI count") + ylab("Size factor")

Cell cycle assignment

## tricycle
set.seed(123)
dat <- project_cycle_space(dat, species = "human", gname.type = "SYMBOL")
dat <- estimate_cycle_position(dat)
dat <- estimate_Schwabe_stage(dat, gname.type = "SYMBOL", species = "human", 
                              batch.v = batchvar, tolerance = 0)
plotReducedDim(dat, "tricycleEmbedding", point_size = 0.5, 
               colour_by = "CCStage")

tbl <- table(dat$StageFactor, dat$CCStage)
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
             cluster_columns = FALSE, cluster_rows = FALSE, 
             row_names_gp = gpar(fontsize = 8),
             name = "Number of\ncells",
             rect_gp = gpar(col = "grey", lwd = 1),
             cell_fun = function(j, i, x, y, width, height, fill) {
                 grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
             }))

## Seurat (using genes from HECA)
s_genes <- read.delim(s_genes)[, 1]
g2m_genes <- read.delim(g2m_genes)[, 1]
seurat <- CreateSeuratObject(counts = assay(dat, "counts"))
seurat <- NormalizeData(seurat, verbose = FALSE)
seurat <- FindVariableFeatures(seurat, selection.method = "vst", verbose = FALSE)
seurat <- ScaleData(seurat, features = rownames(seurat), verbose = FALSE)
seurat <- CellCycleScoring(seurat, s.features = s_genes, g2m.features = g2m_genes, 
                           set.ident = TRUE)
stopifnot(all(rownames(seurat@meta.data) == colnames(dat)))
dat$SeuratCC <- seurat@meta.data$Phase
tbl <- table(dat$StageFactor, dat$SeuratCC)
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
             cluster_columns = FALSE, cluster_rows = FALSE, 
             row_names_gp = gpar(fontsize = 8),
             name = "Number of\ncells",
             rect_gp = gpar(col = "grey", lwd = 1),
             cell_fun = function(j, i, x, y, width, height, fill) {
                 grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
             }))

Find and mark doublets

set.seed(123)
dat <- scDblFinder(dat, samples = blockvar, verbose = FALSE, 
                   BPPARAM = MulticoreParam(workers = 8L))
table(dat$scDblFinder.class, dat[[blockvar]])
         
          4861STDY7387181 4861STDY7387182 4861STDY7771115 4861STDY7771123
  singlet             380             374            3698             764
  doublet              35              19             248              28
         
          GSM4577306 GSM4577307 GSM4577308 GSM4577309 GSM4577310 GSM4577311
  singlet        758       2013       4421      12408       2356       2042
  doublet         31        111        384       1657        136        129
         
          GSM4577312 GSM4577313 GSM4577314 GSM4577315 Huang_GSM6605437
  singlet       2442        156       1159       1723             2539
  doublet        139         16         59         82              191
         
          Huang_GSM6605438 Huang_GSM6605439 Huang_GSM6605440 Huang_GSM7277296
  singlet              665             1525             1361             1780
  doublet               45               79               73               96
         
          Huang_GSM7277297 Huang_GSM7277298 MRC_Endo8625699 MRC_Endo8715415
  singlet              633             1077              83             113
  doublet               24               50               7               5
         
          MRC_Endo8715416 Shih_MEtissue1 Shih_MEtissue3
  singlet             620             49             98
  doublet              23              4             14

Sketching

We find highly variable genes while blocking on the batch, to focus on the biological variance.

set.seed(123)
sketchcells.bybatch <- unlist(lapply(unique(colData(dat)[[batchvar]]), function(b) {
    subd <- dat[, which(colData(dat)[[batchvar]] == b)]
    geneVar <- scran::modelGeneVar(subd)
    subhvgs <- scran::getTopHVGs(geneVar, n = 2000)
    subd <- scater::runPCA(subd, exprs_values = "logcounts", ncomponents = 30, 
                           name = "PCAsub", 
                           subset_row = subhvgs, scale = FALSE)
    idxsub <- geosketch(reducedDim(subd, "PCAsub"), 
                        N = min(ncol(subd), max(0.1 * ncol(subd), 150)), 
                        seed = 123)
    colnames(subd)[idxsub]
}))
Warning in (function (A, nv = 5, nu = nv, maxit = 1000, work = nv + 7, reorth =
TRUE, : You're computing too large a percentage of total singular values, use a
standard svd instead.
dat$sketched.bybatch <- ifelse(colnames(dat) %in% sketchcells.bybatch, TRUE, FALSE)
table(batch = colData(dat)[[batchvar]], sketched.bybatch = dat$sketched.bybatch)
            sketched.bybatch
batch        FALSE  TRUE
  A13          658   150
  A30         4265   473
  E1             0    90
  E3           611   150
  GSM4577306   639   150
  GSM4577307  1912   212
  GSM4577308  4325   480
  GSM4577309 12659  1406
  GSM4577310  2243   249
  GSM4577311  1954   217
  GSM4577312  2323   258
  GSM4577313    22   150
  GSM4577314  1068   150
  GSM4577315  1625   180
  GSM6605437  2457   273
  GSM6605438   560   150
  GSM6605439  1444   160
  GSM6605440  1284   150
  GSM7277296  1689   187
  GSM7277297   507   150
  GSM7277298   977   150
  TB13020851     0   112
  TB13021372     0    53

Find an alternative reduced dimension representation, after sketching

set.seed(123)
## First find a new set of HVGs
dattmp <- dat[, sketchcells.bybatch]
stopifnot(all(rownames(dattmp) == rownames(dat)))
rowData(dat)$scran_geneVar_geosketch <- 
    scran::modelGeneVar(dattmp, block = dattmp[[datasetvar]])
hvgs <- scran::getTopHVGs(rowData(dat)$scran_geneVar_geosketch, n = 3000)
metadata(dat)$hvg.scran.genevar.geosketch <- hvgs

head(
    as.data.frame(
        rowData(dat)$scran_geneVar_geosketch[
            order(rowData(dat)$scran_geneVar_geosketch$bio, 
                  decreasing = TRUE), 
            c("mean", "total", "tech", "bio", "p.value", "FDR")]
    ), 
    25
)
             mean     total      tech       bio       p.value           FDR
PAEP    2.1066383 6.4098906 0.9501136 5.4597770  0.000000e+00  0.000000e+00
CXCL14  0.7296767 2.2408316 0.3970415 1.8437901 4.893356e-310 4.293430e-306
SCGB2A1 1.7646943 2.5129607 0.7697078 1.7432528  6.521461e-62  2.200742e-59
MT1G    1.1138977 2.0830653 0.4075629 1.6755024 6.560409e-163 1.279134e-159
SLPI    2.0791303 2.5438547 0.9892680 1.5545867  2.625436e-45  6.581593e-43
SCGB1D2 0.9692065 1.9679226 0.4401188 1.5278039 7.133695e-120 6.954560e-117
MT2A    1.7135994 2.2420725 0.7701062 1.4719663  4.255256e-62  1.493425e-59
CXCL8   1.2918438 1.9789657 0.6835757 1.2953899  1.351486e-77  6.080993e-75
WFDC2   2.5473116 2.3303857 1.0515430 1.2788427  1.691466e-23  1.890563e-21
S100A6  1.9674850 2.0970420 0.9120803 1.1849616  1.038733e-33  1.804721e-31
VIM     1.5813201 1.9106186 0.7476998 1.1629188  5.670487e-38  1.124438e-35
SOD2    2.1511436 2.0255834 0.9012891 1.1242943  1.635633e-38  3.337452e-36
CRYAB   0.9518985 1.5469181 0.5259863 1.0209318  8.073129e-72  3.079723e-69
MMP7    0.7212142 1.3917734 0.4142810 0.9774924  2.849418e-89  1.612954e-86
HSPA1A  1.4907465 1.7464946 0.7769755 0.9695191  9.968352e-27  1.356005e-24
IGFBP7  1.1095376 1.5747259 0.6082635 0.9664624  2.016097e-39  4.478286e-37
IER3    2.0451952 1.7394000 0.8661273 0.8732727  2.205551e-17  1.751268e-15
TM4SF1  1.5680490 1.6325102 0.7754204 0.8570898  7.623575e-33  1.286332e-30
CAPS    1.0299802 1.3657216 0.5303950 0.8353266  7.882191e-77  3.457917e-74
FOS     2.1156574 1.6169544 0.8387706 0.7781838  1.874416e-12  9.199434e-11
RIMKLB  0.9138842 1.2786274 0.5004791 0.7781483  5.856457e-49  1.581063e-46
GPX3    0.5820514 1.1166279 0.3528173 0.7638106  9.797900e-83  4.775932e-80
CCL20   0.4396294 1.0206143 0.2808581 0.7397562 4.732702e-131 4.885262e-128
CXCL2   0.9461257 1.2510548 0.5248689 0.7261860  3.930548e-38  7.927960e-36
SCGB1D4 0.3711827 0.9053546 0.2023049 0.7030497 1.417861e-136 1.555039e-133
## Next run multibatch PCA on the sketched cells
set.seed(123)
dattmp <- dat[, sketchcells.bybatch]
mbpca <- multiBatchPCA(dattmp, batch = dattmp[[batchvar]], d = 30, 
                       subset.row = metadata(dat)$hvg.scran.genevar.geosketch, 
                       weights = FALSE, preserve.single = TRUE)
projection <- mbpca[[1]]
rotation <- metadata(mbpca)$rotation
centers <- metadata(mbpca)$centers
## Sanity check
tmp <- t(sweep(assay(dattmp[names(centers), ], "logcounts"), MARGIN = 1, 
               STATS = centers, FUN = "-")) %*% rotation
max(abs(tmp - projection))
[1] 1.048051e-13
## Project all cells
pca <- t(sweep(assay(dat[names(centers), ], "logcounts"), MARGIN = 1, 
               STATS = centers, FUN = "-")) %*% rotation
Warning in asMethod(object): sparse->dense coercion: allocating vector of size
1.1 GiB
## Perform batch correction in the PCA space
stopifnot(all(rownames(pca) == colnames(dat)))
reducedDim(dat, "multibatchPCA") <- as.matrix(pca)
mnn <- reducedMNN(as.matrix(pca), batch = dat[[batchvar]], k = 20, 
                  BPPARAM = MulticoreParam(workers = 8L))
stopifnot(all(rownames(mnn) == colnames(dat)))
reducedDim(dat, "fastMNNcorr_sketch") <- mnn$corrected

Dimension reduction

Apply tSNE and UMAP to sketched cells and project the others.

## TSNE
set.seed(123)
tsne <- snifter::fitsne(
    reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"), 
    n_jobs = 8L, random_state = 123L,
    perplexity = 30, n_components = 2L, 
    pca = FALSE)
proj <- snifter::project(
    x = tsne, 
    new = reducedDim(dat, "fastMNNcorr_sketch"), 
    old = reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"))
rownames(proj) <- colnames(dat)
reducedDim(dat, "TSNE_fastMNNcorr_sketch") <- proj

## UMAP
set.seed(123)
umap <- uwot::umap(
    X = reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"), 
    n_components = 2, n_neighbors = 50,
    pca = NULL, scale = FALSE, 
    ret_model = TRUE, n_threads = 8L)
proj <- uwot::umap_transform(
    X = reducedDim(dat, "fastMNNcorr_sketch"), 
    model = umap, 
    n_threads = 8L)
rownames(proj) <- colnames(dat)
reducedDim(dat, "UMAP_fastMNNcorr_sketch") <- proj

Shuffle cells (to avoid plotting in layers by sample later)

set.seed(123)
dat <- dat[, sample(seq_len(ncol(dat)), ncol(dat), replace = FALSE)]

Clustering

## Helper function
## method should be "louvain" or "leiden"
clusterfcn <- function(sce, dimred = "fastMNNcorr", method = "leiden",
                       resolutions = c(0.05, 0.1, 0.2, 0.4, 0.6, 1.0), 
                       sketched_cells = NULL) {
    if (is.null(sketched_cells)) {
        for (r in resolutions) {
            set.seed(123)
            mat <- reducedDim(sce, dimred)
            clst <- bluster::clusterRows(
                mat, 
                BLUSPARAM = bluster::SNNGraphParam(
                    k = 10, type = "rank", 
                    num.threads = 1L, 
                    cluster.fun = method, 
                    cluster.args = list(resolution = r)))
            colData(sce)[[paste0(method, "_res", r)]] <- as.character(clst)
        }
    } else {
        train <- reducedDim(sce[, sketched_cells], dimred)
        test <- reducedDim(sce[, setdiff(colnames(sce), sketched_cells)], dimred)
        for (r in resolutions) {
            set.seed(123)
            df <- data.frame(cell = colnames(sce), cluster = NA_character_)
            clst <- as.character(bluster::clusterRows(
                train,  
                BLUSPARAM = bluster::SNNGraphParam(
                    k = 10, type = "rank", 
                    num.threads = 1L, 
                    cluster.fun = method, 
                    cluster.args = list(resolution = r))))
            preds <- class::knn(train = train, test = test, 
                                cl = clst, k = 1) |> as.character()
            df$cluster[match(rownames(train), df$cell)] <- clst
            df$cluster[match(rownames(test), df$cell)] <- preds
            stopifnot(colnames(sce) == df$cell)
            colData(sce)[[paste0(method, "_res", r)]] <- df$cluster
        }
    }
    
    sce
}

dat <- clusterfcn(dat, dimred = "fastMNNcorr_sketch", method = "leiden",
                  resolutions = 0.4, 
                  sketched_cells = sketchcells.bybatch)

Annotate clusters

clust_annot <- stack(
    list("pre-ciliated" = 2, 
         "ciliated" = c(1, 3, 8, 30), 
         "secretory late" = 41, 
         "secretory mid" = c(22, 25, 35, 34, 40, 38, 21, 18, 19, 23), 
         "secretory early" = c(26, 31, 36, 37), 
         "luminal late" = c(45, 28, 32, 46, 20, 33), 
         "transcriptionally active" = c(15, 10, 39), 
         "luminal early" = 14, 
         "MUC5B+" = 42, 
         "KRT5+" = 43, 
         "remodeling" = c(16, 24, 13), 
         "cycling" = c(4, 29, 6, 11, 12, 9), 
         "hormone responsive early" = c(5, 7, 47), 
         "hormone responsive late" = c(17, 44, 27))
)
dat$leiden_res0.4_annot <- clust_annot$ind[match(dat$leiden_res0.4, 
                                                 as.character(clust_annot$values))]
table(dat$leiden_res0.4_annot)

            pre-ciliated                 ciliated           secretory late 
                     178                     3595                     1265 
           secretory mid          secretory early             luminal late 
                   11291                     4299                    10567 
transcriptionally active            luminal early                   MUC5B+ 
                    1402                     1274                      152 
                   KRT5+               remodeling                  cycling 
                      81                      734                     6035 
hormone responsive early  hormone responsive late 
                    4874                     3175 
leiden_res0.4_annot_cols <- c(
    "pre-ciliated" = "lightskyblue", 
    "ciliated" = "royalblue", 
    "secretory early" = "#C8AEE2", 
    "secretory mid" = "#7D4DBA", 
    "secretory late" = "#3A1260", 
    "luminal early" = "gold1", 
    "luminal late" = "darkorange", 
    "transcriptionally active" = "turquoise4", 
    "MUC5B+" = "salmon", 
    "KRT5+" = "firebrick", 
    "remodeling" = "#4CAF50", 
    "cycling" = "mediumaquamarine", 
    "hormone responsive early" = "hotpink2", 
    "hormone responsive late" = "hotpink4" 
) 

Dot plots of clusters and marker genes of interest

col = circlize::colorRamp2(
    breaks = seq(0, 7.5, length.out = 64),
    colors = colorRampPalette(c("grey90", "purple"))(64))

markers <- intersect(
    c("MKI67", "MMP7", "STMN1", "PKM", "ALDH1A1", "VIM", "LGR5", "LEFTY1",
      "IL6", "ENPP3", "WNT7A", "IL32", "SAA1", "SAA2", "CLU", "MUC5B", "LCN2", 
      "LY6E", "S100A6", "PLAUR", "PLAU", "SFN", "TNFRSF12A", "IL23A", "KRT17",
      "KRT5", "ANXA1", "CXCL8", "SCGB2A1", "SBGB2A2", "SCGB1D2", "IL6ST", 
      "PAEP", "FGF7", "PGR", "ESR1", "CCNO", "FOXJ1", "PIFO", "RSPH1", "DKK1"),
    rownames(dat))
plotDots(dat, features = markers,
         group = "leiden_res0.4_annot", 
         swap_rownames = "gene_name", zlim = c(0, 7.5),
         colour = col(seq(0, 7.5, length.out = 64))) + 
    scale_size(limits=c(0.2, 1)) + 
    labs(x = "") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Marker genes from HECA paper (https://www.nature.com/articles/s41588-024-01873-w, Fig 2a).

markers <- intersect(
    c("PAX8", "EPCAM", "FOXA2", "MMP7", "SOX9", "AXIN2", "TRH", "ALDH1A1",
      "CDH2", "KLK11", "SLC7A11", "DKK1", "PHLDA1", "KMO", "IHH", "EMID1", 
      "HPRT1", "SUFU", "OPRK1", "CBR3", "S100P", "SCGB2A2", "ABCG1", "PAEP", 
      "DPP4", "GPX3", "PTPRR", "FGF7", "FXYD2", "IL32", "WNT7A", "LGR5", 
      "MMP7", "TNF", "VTCN1", "CLDN22", "SULT1E1", "LEFTY1", "PTGS1", "IL6",
      "CCNO", "CDC20B", "MUC12", "PIFO", "FOXJ1", "TP73", "BPIFB1", "TFF3", 
      "MUC5B", "SAA1", "TP63", "SNCG", "KRT5", "PGR", "AR", "ESR1"), 
    rownames(dat))
plotDots(dat, features = markers,
         group = "leiden_res0.4_annot", 
         swap_rownames = "gene_name", zlim = c(0, 7.5),
         colour = col(seq(0, 7.5, length.out = 64))) + 
    scale_size(limits=c(0.2, 1)) + 
    labs(x = "") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Plot dimension reductions

stage_colors <- c( 
    "early-secretory" = "#F4EBFA",
    "early-mid-secretory" = "#C8AEE2",
    "proliferative" = "hotpink2",
    "mid-secretory" = "#7D4DBA",
    "late-secretory" = "#3A1260",
    "menstrual" = "#A30B2E"
)

seurat_colors <- c(
    "S" = "gold",
    "G1" = "grey80",
    "G2M" = "brown"
)

plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
               other_fields = "HECA_Celltype", point_size = 0.5) + 
    geom_point(aes(color = HECA_Celltype), size = 0.5) + 
    guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
    theme_void()

for (vrb in c("RCA_Celltype", "RCA_Location", "RCA_BiopsyType", 
              "matchDataset", "StageCoarse", "donorCol", 
              "scDblFinder.class", "leiden_res0.4")) {
    print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                         colour_by = vrb, point_size = 0.5) + 
              guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
              theme_void())
}

print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                     colour_by = "leiden_res0.4_annot", point_size = 0.5) + 
          scale_color_manual(values = leiden_res0.4_annot_cols, name = "") + 
          guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
          theme_void())
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                     colour_by = "Stage", point_size = 0.5) + 
          scale_color_manual(values = stage_colors, name = "") + 
          guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
          theme_void())
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                     colour_by = "SeuratCC", point_size = 0.5) + 
          scale_color_manual(values = seurat_colors, name = "") + 
          guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
          theme_void())
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

for (vrb in c("subsets_MT_percent", "detected", "intronic_fraction")) {
    print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch", 
                         colour_by = vrb, point_size = 0.5) + 
              theme_void())
}

Plot cluster abundance over the cycle

col <- "leiden_res0.4_annot"

df <- data.frame(colData(dat)[, c("StageFactor", col)]) |>
    group_by(StageFactor, .data[[col]]) |>
    tally() |>
    group_by(StageFactor) |>
    mutate(perc = n / sum(n) * 100) |>
    group_by(.data[[col]]) |>
    mutate(scaledperc = perc / sum(perc)) |>
    ungroup() |>
    dplyr::filter(!is.na(StageFactor))
ggplot(df, aes(x = StageFactor, y = perc, color = .data[[col]])) + 
    geom_point() + 
    geom_line(aes(x = as.numeric(StageFactor)), linewidth = 1) + 
    theme_cowplot() + 
    labs(x = "", y = "Relative abundance (%)",
         title = col) + 
    theme(legend.position = "bottom") + 
    scale_color_manual(values = leiden_res0.4_annot_cols, name = "")

## One panel per cluster
ggplot(df, aes(x = StageFactor, y = perc, color = .data[[col]])) + 
    geom_point() + 
    geom_line(aes(x = as.numeric(StageFactor)), linewidth = 1) + 
    theme_cowplot() + 
    labs(x = "", y = "Relative abundance (%)",
         title = col) + 
    facet_wrap(~ .data[[col]], scales = "free_y") + 
    scale_color_manual(values = leiden_res0.4_annot_cols, name = "") + 
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          strip.text = element_text(size = 10))

Find marker genes

First, we compare all clusters in a pairwise manner. The comparison is only performed for genes with a non-zero count in at least 0.05% of the cells (corresponding to 25 of the 48922 cells in the data set). These gene IDs are also written to a file, for potential use as the ‘universe’ in over-representation analyses.

frac_detected <- rowMeans(assay(dat, "counts") > 0)
genes_to_test <- rownames(dat)[frac_detected >= (0.05 / 100)]
write.table(genes_to_test, file = "epithelial_cell_atlas_gene_universe.txt", 
            row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")

clc <- "leiden_res0.4_annot"
ptts <- pairwiseTTests(dat, groups = dat[[clc]], 
                       block = dat[[blockvar]], direction = "up", 
                       subset.row = genes_to_test)
markers <- combineMarkers(ptts$statistics, ptts$pairs, pval.type = "all")

## Write results to files
ttestdir <- paste0("pairwise_ttests_", clc)
if (!file.exists(ttestdir)) {
    dir.create(ttestdir, recursive = TRUE)
}
for (i in seq_len(nrow(ptts$pairs))) {
    df <- cbind(gene = rownames(ptts$statistics[[i]]), 
                ptts$statistics[[i]])
    df <- df[order(df$p.value, -df$logFC), ]
    filename <- paste0(clc, "_cluster_", ptts$pairs$first[i],
                       "_vs_", ptts$pairs$second[i], ".txt")
    write.table(df, file = file.path(ttestdir, filename), quote = FALSE,
                sep = "\t", row.names = FALSE, col.names = TRUE)
}

markerdir <- paste0("marker_genes_all_", clc)
if (!file.exists(markerdir)) {
    dir.create(markerdir, recursive = TRUE)
}
for (i in seq_along(markers)) {
    df <- cbind(gene = rownames(markers[[i]]), markers[[i]])
    filename <- paste0(clc, "_cluster_", names(markers)[i], "_markers.txt")
    write.table(df, file = file.path(markerdir, filename), quote = FALSE,
                sep = "\t", row.names = FALSE, col.names = TRUE)
}

Save object

saveRDS(dat, file = "invivo_epithelial_cell_atlas.rds")

Session info

sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Rocky Linux 8.10 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Zurich
tzcode source: system (glibc)

attached base packages:
 [1] parallel  grid      stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] DT_0.33                     igraph_2.1.4               
 [3] BiocParallel_1.40.0         bluster_1.16.0             
 [5] org.Hs.eg.db_3.20.0         AnnotationDbi_1.68.0       
 [7] tricycle_1.14.0             sketchR_1.2.0              
 [9] Seurat_5.2.1                SeuratObject_5.0.2         
[11] sp_2.2-0                    scDblFinder_1.20.2         
[13] circlize_0.4.16             ComplexHeatmap_2.22.0      
[15] cowplot_1.1.3               batchelor_1.22.0           
[17] msigdbr_7.5.1               tidyr_1.3.1                
[19] dplyr_1.1.4                 scran_1.34.0               
[21] scater_1.34.0               ggplot2_3.5.1              
[23] scuttle_1.16.0              SingleCellExperiment_1.28.1
[25] SummarizedExperiment_1.36.0 Biobase_2.66.0             
[27] GenomicRanges_1.58.0        GenomeInfoDb_1.42.3        
[29] IRanges_2.40.1              S4Vectors_0.44.0           
[31] BiocGenerics_0.52.0         MatrixGenerics_1.18.1      
[33] matrixStats_1.5.0           fishpond_2.12.0            

loaded via a namespace (and not attached):
  [1] spatstat.sparse_3.1-0     bitops_1.0-9             
  [3] httr_1.4.7                RColorBrewer_1.1-3       
  [5] doParallel_1.0.17         tools_4.4.2              
  [7] sctransform_0.4.1         R6_2.6.1                 
  [9] ResidualMatrix_1.16.0     lazyeval_0.2.2           
 [11] uwot_0.2.3                GetoptLong_1.0.5         
 [13] withr_3.0.2               gridExtra_2.3            
 [15] progressr_0.15.1          cli_3.6.4                
 [17] Cairo_1.6-2               spatstat.explore_3.3-4   
 [19] fastDummies_1.7.5         labeling_0.4.3           
 [21] sass_0.4.9                mvtnorm_1.3-3            
 [23] spatstat.data_3.1-4       ggridges_0.5.6           
 [25] pbapply_1.7-2             Rsamtools_2.22.0         
 [27] dichromat_2.0-0.1         parallelly_1.42.0        
 [29] limma_3.62.2              RSQLite_2.3.9            
 [31] circular_0.5-1            generics_0.1.3           
 [33] shape_1.4.6.1             BiocIO_1.16.0            
 [35] crosstalk_1.2.1           gtools_3.9.5             
 [37] ica_1.0-3                 spatstat.random_3.3-2    
 [39] Matrix_1.7-1              ggbeeswarm_0.7.2         
 [41] abind_1.4-8               lifecycle_1.0.4          
 [43] yaml_2.3.10               edgeR_4.4.2              
 [45] SparseArray_1.6.2         Rtsne_0.17               
 [47] blob_1.2.4                promises_1.3.2           
 [49] dqrng_0.4.1               crayon_1.5.3             
 [51] dir.expiry_1.14.0         miniUI_0.1.1.1           
 [53] lattice_0.22-6            beachmat_2.22.0          
 [55] KEGGREST_1.46.0           magick_2.8.5             
 [57] pillar_1.10.1             knitr_1.49               
 [59] metapod_1.14.0            boot_1.3-31              
 [61] rjson_0.2.23              xgboost_1.7.8.1          
 [63] future.apply_1.11.3       codetools_0.2-20         
 [65] glue_1.8.0                spatstat.univar_3.1-1    
 [67] data.table_1.17.0         vctrs_0.6.5              
 [69] png_0.1-8                 spam_2.11-1              
 [71] gtable_0.3.6              assertthat_0.2.1         
 [73] cachem_1.1.0              xfun_0.51                
 [75] S4Arrays_1.6.0            mime_0.12                
 [77] survival_3.7-0            iterators_1.0.14         
 [79] statmod_1.5.0             fitdistrplus_1.2-2       
 [81] ROCR_1.0-11               nlme_3.1-166             
 [83] bit64_4.6.0-1             filelock_1.0.3           
 [85] RcppAnnoy_0.0.22          bslib_0.9.0              
 [87] irlba_2.3.5.1             vipor_0.4.7              
 [89] KernSmooth_2.23-24        colorspace_2.1-1         
 [91] DBI_1.2.3                 tidyselect_1.2.1         
 [93] bit_4.5.0.1               compiler_4.4.2           
 [95] curl_6.2.1                BiocNeighbors_2.0.1      
 [97] basilisk.utils_1.18.0     DelayedArray_0.32.0      
 [99] plotly_4.10.4             rtracklayer_1.66.0       
[101] scales_1.3.0.9000         lmtest_0.9-40            
[103] stringr_1.5.1             digest_0.6.37            
[105] goftest_1.2-3             spatstat.utils_3.1-2     
[107] rmarkdown_2.29            basilisk_1.18.0          
[109] XVector_0.46.0            htmltools_0.5.8.1        
[111] pkgconfig_2.0.3           sparseMatrixStats_1.18.0 
[113] fastmap_1.2.0             rlang_1.1.5              
[115] GlobalOptions_0.1.2       htmlwidgets_1.6.4        
[117] UCSC.utils_1.2.0          shiny_1.10.0             
[119] DelayedMatrixStats_1.28.1 jquerylib_0.1.4          
[121] farver_2.1.2              zoo_1.8-13               
[123] jsonlite_1.9.0            BiocSingular_1.22.0      
[125] RCurl_1.98-1.16           magrittr_2.0.3           
[127] GenomeInfoDbData_1.2.13   dotCall64_1.2            
[129] patchwork_1.3.0           Rcpp_1.0.14              
[131] ggnewscale_0.5.1          babelgene_22.9           
[133] viridis_0.6.5             reticulate_1.41.0        
[135] stringi_1.8.4             zlibbioc_1.52.0          
[137] MASS_7.3-61               plyr_1.8.9               
[139] listenv_0.9.1             ggrepel_0.9.6            
[141] deldir_2.0-4              Biostrings_2.74.1        
[143] splines_4.4.2             tensor_1.5               
[145] locfit_1.5-9.11           spatstat.geom_3.3-5      
[147] RcppHNSW_0.6.0            reshape2_1.4.4           
[149] ScaledMatrix_1.14.0       XML_3.99-0.18            
[151] evaluate_1.0.3            snifter_1.16.0           
[153] foreach_1.5.2             httpuv_1.6.15            
[155] RANN_2.6.2                purrr_1.0.4              
[157] polyclip_1.10-7           future_1.34.0            
[159] clue_0.3-66               scattermore_1.2          
[161] rsvd_1.0.5                xtable_1.8-4             
[163] restfulr_0.0.15           svMisc_1.4.3             
[165] RSpectra_0.16-2           later_1.4.1              
[167] class_7.3-22              viridisLite_0.4.2        
[169] tibble_3.2.1              memoise_2.0.1            
[171] beeswarm_0.4.0            GenomicAlignments_1.42.0 
[173] cluster_2.1.6             globals_0.16.3