suppressPackageStartupMessages({
library(fishpond)
library(SummarizedExperiment)
library(SingleCellExperiment)
library(scuttle)
library(scater)
library(scran)
library(dplyr)
library(tidyr)
library(ggplot2)
library(msigdbr)
library(batchelor)
library(cowplot)
library(ComplexHeatmap)
library(circlize)
library(scDblFinder)
library(Seurat)
library(sketchR)
library(tricycle)
library(org.Hs.eg.db)
library(parallel)
library(bluster)
library(BiocParallel)
library(igraph)
library(DT)
})Generating an epithelial endometrial atlas
Load packages
Define parameters and reference files
samples <- c("4861STDY7387181", "4861STDY7387182", "4861STDY7387183",
"4861STDY7771115", "4861STDY7771123", "MRC_Endo8625698",
"MRC_Endo8625699", "MRC_Endo8712024", "MRC_Endo8712032",
"MRC_Endo8715415", "MRC_Endo8715416", "GSM4577306", "GSM4577307",
"GSM4577308", "GSM4577309", "GSM4577310", "GSM4577311",
"GSM4577312", "GSM4577313", "GSM4577314", "GSM4577315",
"Shih_MEtissue1", "Shih_MEtissue2", "Shih_MEtissue3",
"Shih_MEtissue4", "Shih_MEtissue5", "Shih_MEtissue6",
"Shih_MEtissue7", "Huang_GSM6605437", "Huang_GSM6605438",
"Huang_GSM6605440", "Huang_GSM6605439", "Huang_GSM7277296",
"Huang_GSM7277297", "Huang_GSM7277298", "Lai_GSM5572238",
"Lai_GSM5572239", "Lai_GSM5572240")
blockvar <- "sample" ## used as blocking factor in doublet annotation
batchvar <- "donorCol" ## used for batch correction
datasetvar <- "matchDataset" ## used as blocking factor in HVG annotation
## Sample annotation
sampleannot <- "invivo_atlas_samples.txt"
## Gene annotation (derived from GENCODE v38 gtf file)
t2g <- "reference/gencode.v38.annotation.tx2gene_full.rds"
## Reproductive cell atlas data (downloaded from
## https://www.reproductivecellatlas.org/ and converted to SingleCellExperiment
## format)
rca <- "data/reproductivecellatlas/2021/endometrium_all.rds"
heca <- "data/reproductivecellatlas/2023/endometriumAtlasV2_cells_with_counts.rds"
## Cell cycle marker genes
## https://github.com/ventolab/HECA-Human-Endometrial-Cell-Atlas/tree/main/utils
s_genes <- "data/reproductivecellatlas/2023/S_genes_HECA.tsv"
g2m_genes <- "data/reproductivecellatlas/2023/G2M_genes_HECA.tsv"
## Shih et al cell annotations (obtained from
## https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE203191)
shih_cellannot <- "data/GSE203191/GSE203191_Shih_endo_meta_frame.tsv"Assemble gene annotation
geneannot <- readRDS(t2g) |>
dplyr::select(gene_id, seqnames, gene_type, gene_name) |>
dplyr::rename(chromosome = seqnames) |>
dplyr::distinct()
## Annotate ribosomal protein genes
## GO term "structural constituent of ribosome" (GO:0003735)
golist <- as.list(org.Hs.egGO2ALLEGS)
ribo_entrez <- unique(golist[["GO:0003735"]])
length(ribo_entrez)[1] 175
tmp <- select(org.Hs.eg.db, keys = ribo_entrez, columns = "ENSEMBL",
keytype = "ENTREZID")'select()' returned 1:many mapping between keys and columns
ribo_ensembl <- unique(na.omit(tmp[, "ENSEMBL"]))
length(ribo_ensembl)[1] 195
summary(ribo_ensembl %in% sub("\\.[0-9]+$", "", geneannot$gene_id)) Mode FALSE TRUE
logical 33 162
geneannot$is_ribosomal_protein <-
sub("\\.[0-9]+$", "", geneannot$gene_id) %in% ribo_ensembl
geneannot$gene_type_ribo <- ifelse(
geneannot$gene_type == "protein_coding",
ifelse(geneannot$is_ribosomal_protein, "protein_coding_ribosome",
geneannot$gene_type),
geneannot$gene_type)Read data
## Get sample IDs
tbl <- read.delim(sampleannot) |>
dplyr::filter(SampleName %in% samples)
DT::datatable(tbl, options = list(scrollX = TRUE, pageLength = 50))sns <- unique(tbl$SampleName)
names(sns) <- sns
## Read alevin-fry data and combine
dat <- do.call(cbind, lapply(sns, function(s) {
x <- fishpond::loadFry(
fryDir = file.path("data", "alevin_fry_splici", s),
outputFormat = list(counts = c("S", "A"), spliced = c("S", "A"),
unspliced = c("U"), total = c("U", "S", "A")))
rowData(x)$gene_name <-
geneannot$gene_name[match(rowData(x)$gene_ids, geneannot$gene_id)]
rowData(x)$chromosome <-
geneannot$chromosome[match(rowData(x)$gene_ids, geneannot$gene_id)]
rowData(x)$gene_type <-
geneannot$gene_type[match(rowData(x)$gene_ids, geneannot$gene_id)]
rowData(x)$gene_type_ribo <-
geneannot$gene_type_ribo[match(rowData(x)$gene_ids, geneannot$gene_id)]
## Keep only cells with at least 1,000 detected genes
x <- x[, colSums(assay(x, "total") > 0) > 1000]
## Add sample ID to colnames
x$sample <- s
colnames(x) <- paste0(s, "_", colnames(x))
x
}))
datclass: SingleCellExperiment
dim: 60649 274348
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): ENSG00000223972.5 ENSG00000243485.5 ...
ENSG00000210194.1 ENSG00000210196.2
rowData names(5): gene_ids gene_name chromosome gene_type
gene_type_ribo
colnames(274348): 4861STDY7387181_AGAGTGGGTCTAGCGC
4861STDY7387181_TAAGAGATCTGAGTGT ...
Huang_GSM7277298_CTATCCGTCTTTCGAT Huang_GSM7277298_GGGTTTACAGAACTTC
colData names(2): barcodes sample
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
## Set rownames to gene names where possible
rownames(dat) <- scater::uniquifyFeatureNames(
ID = rownames(dat),
names = rowData(dat)$gene_name
)Add sample annotations
## Extract barcodes corresponding to control samples in Shih data and remove
## all other barcodes from the Shih samples
cb <- read.delim(shih_cellannot) |>
dplyr::filter(pheno == "Control") |>
dplyr::mutate(barcode = gsub("-[0-9]$", "", barcode)) |>
dplyr::mutate(sample_cb = paste("Shih", run, barcode, sep = "_")) |>
dplyr::pull(sample_cb)
to_remove <- which(grepl("Shih", dat$sample) &
!paste0(dat$sample, "_", dat$barcodes) %in% cb)
dat <- dat[, -to_remove]
dim(dat)[1] 60649 253579
## Add 'cell' column
dat$cell <- colnames(dat)
## Any other provided annotations
tbl2 <- tbl |>
dplyr::filter(RunID != "Shih")
if (nrow(tbl2) > 0 && (length(tbl2$SampleName) == length(unique(tbl2$SampleName)))) {
for (cn in colnames(tbl2)) {
if (!all(is.na(tbl2[[cn]])) && !(cn %in% colnames(colData(dat)))) {
if (cn == "Day") {
colData(dat)[[cn]] <- paste0(
"D", tbl2[[cn]][match(dat$sample, tbl2$SampleName)])
} else if (cn == "RunID") {
colData(dat)[[cn]] <- sub("_[0-9]+$", "",
tbl2[[cn]][match(dat$sample,
tbl2$SampleName)])
} else {
colData(dat)[[cn]] <- tbl2[[cn]][match(dat$sample, tbl2$SampleName)]
}
}
}
}
## Harmonize annotation values
dat$Stage <- gsub(" stage", "", dat$Stage)
## Add information for the Shih data set
shih_annot <- read.delim(shih_cellannot) |>
dplyr::mutate(cell = paste0("Shih_", run, "_", gsub("-[0-9]$", "", barcode)))
if (any(dat$cell %in% shih_annot$cell)) {
for (cn in c("clusterID", "pheno", "Phase", "subjectID", "NKsubclusterID",
"StromalSubclusterID")) {
colData(dat)[[paste0("Shih_", cn)]] <-
shih_annot[[cn]][match(dat$cell, shih_annot$cell)]
}
}
dat$RunID[grep("Shih", dat$sample)] <- "Shih"
dat$Stage[grep("Shih", dat$sample)] <- "menstrual"
dat$StageFactor <- factor(dat$Stage,
levels = c("menstrual", "proliferative",
"early-secretory", "early-mid-secretory",
"mid-secretory", "late-secretory"))
dat$StageCoarse <- sub("early-|early-mid-|mid-|late-", "", dat$Stage)
## Create a 'donorCol' column
dat$donorCol <- NA_character_
if ("DonorID" %in% colnames(colData(dat))) {
idx <- which(!is.na(dat[["DonorID"]]))
dat$donorCol[idx] <- dat[["DonorID"]][idx]
}
if ("Shih_subjectID" %in% colnames(colData(dat))) {
idx <- which(!is.na(dat[["Shih_subjectID"]]))
dat$donorCol[idx] <- dat[["Shih_subjectID"]][idx]
}
## Create a 'sampleCol' column (either sample or Shih_subjectID)
dat$sampleCol <- NA_character_
if ("sample" %in% colnames(colData(dat))) {
idx <- which(!is.na(dat[["sample"]]))
dat$sampleCol[idx] <- dat[["sample"]][idx]
}
if ("Shih_subjectID" %in% colnames(colData(dat))) {
idx <- which(!is.na(dat[["Shih_subjectID"]]))
dat$sampleCol[idx] <- dat[["Shih_subjectID"]][idx]
}
## Generate columns to match to HECA
dat$matchDataset <- ifelse(dat$RunID %in% c("Wang", "Shih", "Lai", "Huang"),
dat$RunID, "GarciaAlonso")
dat$matchCell <- sub("^Huang_|^Lai_|^Shih_", "", dat$cell)
dat$matchId <- paste0(dat$matchCell, "-", dat$matchDataset)
## Add annotations from the reproductive cell atlas (2021)
rca <- readRDS(rca)
table(dat$matchDataset, colnames(dat) %in% colnames(rca), useNA = "ifany")
FALSE TRUE
GarciaAlonso 28396 40337
Huang 63190 0
Lai 18499 0
Shih 3110 0
Wang 47223 52824
dat$RCA_CellCyclePhase <- rca$`CellCycle Phase`[match(colnames(dat), colnames(rca))]
dat$RCA_Celltype <- rca$`Cell type`[match(colnames(dat), colnames(rca))]
dat$RCA_percentmito <- rca$percent_mito[match(colnames(dat), colnames(rca))]
dat$RCA_ngenes <- rca$n_genes[match(colnames(dat), colnames(rca))]
dat$RCA_BiopsyType <- rca$BiopsyType[match(colnames(dat), colnames(rca))]
dat$RCA_Location <- rca$Location[match(colnames(dat), colnames(rca))]
dat$RCA_BroadCelltype <- rca$`Broad cell type`[match(colnames(dat), colnames(rca))]
## Add annotations from HECA (2023)
heca <- readRDS(heca)
table(dat$matchDataset, dat$matchId %in% colnames(heca), useNA = "ifany")
FALSE TRUE
GarciaAlonso 37041 31692
Huang 34420 28770
Lai 6227 12272
Shih 3110 0
Wang 46329 53718
dat$HECA_CellCyclePhase <- as.character(heca$phase[match(dat$matchId, colnames(heca))])
dat$HECA_Celltype <- as.character(heca$celltype[match(dat$matchId, colnames(heca))])
dat$HECA_ngenes <- heca$n_genes[match(dat$matchId, colnames(heca))]
dat$HECA_percentmito <- heca$percent_mito[match(dat$matchId, colnames(heca))]
dat$HECA_ScrubletScore <- heca$scrublet_score[match(dat$matchId, colnames(heca))]
dat$HECA_Stage <- as.character(heca$Stage[match(dat$matchId, colnames(heca))])
dat$HECA_Lineage <- as.character(heca$lineage[match(dat$matchId, colnames(heca))])
dat$HECA_Label <- as.character(heca$label_long[match(dat$matchId, colnames(heca))])
## Filter to only keep cells that have a value for the batch variable
(dat <- dat[, !is.na(dat[[batchvar]])])class: SingleCellExperiment
dim: 60649 253579
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): DDX11L1 MIR1302-2HG ... MT-TE MT-TP
rowData names(5): gene_ids gene_name chromosome gene_type
gene_type_ribo
colnames(253579): 4861STDY7387181_AGAGTGGGTCTAGCGC
4861STDY7387181_TAAGAGATCTGAGTGT ...
Huang_GSM7277298_CTATCCGTCTTTCGAT Huang_GSM7277298_GGGTTTACAGAACTTC
colData names(49): barcodes sample ... HECA_Lineage HECA_Label
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
## Sample annotation summary
DT::datatable(as.data.frame(colData(dat)) |>
dplyr::select(sample, SampleType, DonorID, SampleName, RunID,
ReadLength1, ReadLength2, Tissue, Stage,
Digestion, Protocol, Source, Sex,
TimeInMenstrualCycle, DiseaseState,
Fecundity, Phenotype, DevStage,
Shih_pheno, Shih_subjectID,
StageFactor, StageCoarse, donorCol, sampleCol,
matchDataset) |>
dplyr::distinct(),
rownames = FALSE,
options = list(scrollX = TRUE, pageLength = 50))Extract only epithelial cells for further analysis
We perform a basic pre-analysis with the aim to cluster the cells and extract the ones that are likely of epithelial origin, for further processing.
set.seed(123)
## Normalize
dat0 <- scuttle::computeLibraryFactors(dat)
dat0 <- batchelor::multiBatchNorm(dat0, batch = dat0[[batchvar]], min.mean = 0.1)
## Find HVGs
scran_geneVar <- scran::modelGeneVar(dat0, block = dat0[[datasetvar]])
hvg.scran.genevar <- scran::getTopHVGs(scran_geneVar, n = 3000)
## Batch correction and dimension reduction
mnn.out <- fastMNN(dat0, batch = dat0[[batchvar]], d = 50, k = 20,
subset.row = hvg.scran.genevar)
stopifnot(all(colnames(mnn.out) == colnames(dat0)))
reducedDim(dat0, "fastMNNcorr") <- reducedDim(mnn.out, "corrected")
dat0 <- scater::runUMAP(dat0, dimred = "fastMNNcorr",
name = "UMAP_fastMNNcorr", n_neighbors = 50)
plotReducedDim(dat0, dimred = "UMAP_fastMNNcorr", colour_by = "HECA_Lineage",
point_size = 0.5) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))## Cluster the UMAP and select the epithelial cluster
grph <- bluster::makeSNNGraph(reducedDim(dat0, "UMAP_fastMNNcorr"),
k = 20, type = "rank",
BPPARAM = MulticoreParam(workers = 8L))
clst <- igraph::cluster_louvain(grph, resolution = 0.001)
dat0$louvain <- as.character(clst$membership)
plotReducedDim(dat0, dimred = "UMAP_fastMNNcorr", colour_by = "louvain",
point_size = 0.5) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))tbl <- table(dat0$louvain, dat0$HECA_Lineage)
tbl <- sweep(tbl, MARGIN = 1, STATS = rowSums(tbl), FUN = "/")
tbl
Endothelial Epithelial Immune Mesenchymal
1 9.593016e-05 9.988488e-01 9.593016e-04 9.593016e-05
2 4.684280e-05 3.278996e-04 2.451440e-03 9.971738e-01
3 9.988607e-01 4.882812e-04 1.627604e-04 4.882812e-04
4 0.000000e+00 0.000000e+00 3.020236e-04 9.996980e-01
5 0.000000e+00 0.000000e+00 5.773672e-04 9.994226e-01
6 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
7 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
## epithelial
(keepclust_epithelial <- rownames(tbl)[tbl[, "Epithelial"] > 0.9])[1] "1"
dat0$keep_epithelial <- dat0$louvain %in% keepclust_epithelial
table(dat0$keep_epithelial)
FALSE TRUE
168225 85354
scater::plotReducedDim(dat0, dimred = "UMAP_fastMNNcorr",
colour_by = "keep_epithelial", point_size = 0.5) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))## Subset the original data set to the epithelial cells, and delete the
## temporary dataset used above
dat0 <- dat0[, which(dat0$keep_epithelial)]
dim(dat0)[1] 60649 85354
stopifnot(colnames(dat0) %in% colnames(dat))
dat <- dat[, colnames(dat0)]
rm(dat0)
datclass: SingleCellExperiment
dim: 60649 85354
metadata(0):
assays(4): counts spliced unspliced total
rownames(60649): DDX11L1 MIR1302-2HG ... MT-TE MT-TP
rowData names(5): gene_ids gene_name chromosome gene_type
gene_type_ribo
colnames(85354): 4861STDY7387181_AGAGTGGGTCTAGCGC
4861STDY7387181_CATCAAGTCCTGCTTG ...
Huang_GSM7277298_ATTTACCGTGGTTTGT Huang_GSM7277298_AATGGCTAGACACACG
colData names(49): barcodes sample ... HECA_Lineage HECA_Label
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
Check total count for each gene type and removed unwanted gene types
keepGeneTypes <- c("protein_coding", "IG_V_gene", "IG_C_gene", "IG_J_gene",
"TR_C_gene", "TR_J_gene", "TR_V_gene", "TR_D_gene",
"IG_D_gene", "Mt_tRNA", "Mt_rRNA")
pd <- data.frame(gene = rownames(dat),
gene_type_ribo = rowData(dat)$gene_type_ribo,
total_count = rowSums(counts(dat))) |>
group_by(gene_type_ribo) |>
summarize(nbr_genes = length(gene),
total_count = sum(total_count)) |>
mutate(keep = gene_type_ribo %in% keepGeneTypes) |>
arrange(desc(nbr_genes))
pd$gene_type_ribo <- factor(pd$gene_type_ribo, levels = unique(pd$gene_type_ribo))
p1 <- ggplot(pd, aes(x = gene_type_ribo, y = nbr_genes, fill = keep)) +
geom_col() +
labs(x = "", y = "Number of genes", fill = "keep") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- ggplot(pd, aes(x = gene_type_ribo, y = total_count, fill = keep)) +
geom_col() +
labs(x = "", y = "Total UMI count", fill = "keep") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot_grid(p1, p2, ncol = 1)dat <- dat[rowData(dat)$gene_type_ribo %in% keepGeneTypes, ]
dim(dat)[1] 20231 85354
Add QC metrics
(MT <- grep("^MT-", rownames(dat), value = TRUE)) [1] "MT-TF" "MT-RNR1" "MT-TV" "MT-RNR2" "MT-TL1" "MT-ND1" "MT-TI"
[8] "MT-TM" "MT-ND2" "MT-TW" "MT-CO1" "MT-TD" "MT-CO2" "MT-TK"
[15] "MT-ATP8" "MT-ATP6" "MT-CO3" "MT-TG" "MT-ND3" "MT-TR" "MT-ND4L"
[22] "MT-ND4" "MT-TH" "MT-TS2" "MT-TL2" "MT-ND5" "MT-CYB" "MT-TT"
[29] "MT-TQ" "MT-TA" "MT-TN" "MT-TC" "MT-TY" "MT-TS1" "MT-ND6"
[36] "MT-TE" "MT-TP"
dat <- scuttle::addPerCellQC(dat, assay.type = "counts", subsets = list(MT = MT))
dat <- scuttle::addPerFeatureQC(dat, assay.type = "counts")
dat$intronic_fraction <-
colSums(assay(dat, "unspliced")) / colSums(assay(dat, "total"))
rowData(dat)$intronic_fraction <-
rowSums(assay(dat, "unspliced")) / rowSums(assay(dat, "total"))QC plots
total_count_threshold <- 1000
detected_threshold <- 500
mt_percent_threshold <- 25
dat$keep <- (dat$total > total_count_threshold) &
(dat$detected > detected_threshold) &
(dat$subsets_MT_percent < mt_percent_threshold)
table(dat$keep)
FALSE TRUE
35765 49589
cdplot <- as.data.frame(colData(dat))
sid <- "sample"
stid <- "StageFactor"
ggplot(cdplot, aes(x = .data[[sid]], y = sum, fill = .data[[stid]])) +
geom_hline(yintercept = total_count_threshold) +
geom_violin(scale = "width") + theme_minimal() +
labs(y = "Total UMI count per cell") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))ggplot(cdplot, aes(x = .data[[sid]], y = sum, fill = .data[[stid]])) +
geom_hline(yintercept = total_count_threshold) +
geom_violin(scale = "width") + theme_minimal() +
scale_y_log10() +
labs(y = "Total UMI count per cell") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))ggplot(cdplot, aes(x = .data[[sid]], y = detected, fill = .data[[stid]])) +
geom_hline(yintercept = detected_threshold) +
geom_violin(scale = "width") + theme_minimal() +
labs(y = "Number of detected genes per cell") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))ggplot(cdplot, aes(x = .data[[sid]], y = subsets_MT_percent, fill = .data[[stid]])) +
geom_hline(yintercept = mt_percent_threshold) +
geom_violin(scale = "width") + theme_minimal() +
labs(y = "Percentage of MT count per cell") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))ggplot(cdplot, aes(x = detected, y = subsets_MT_percent, color = keep)) +
geom_point(size = 0.5, alpha = 0.5) + theme_minimal() +
labs(x = "Number of detected genes per cell",
y = "Percentage of MT count per cell")ggplot(cdplot, aes(x = detected, y = intronic_fraction, color = keep)) +
geom_point(size = 0.5, alpha = 0.5) + theme_minimal() +
labs(x = "Number of detected genes per cell",
y = "Intronic fraction per cell")Filter and remove samples with too few cells
(tbl <- table(sample = dat$sample, keep = dat$keep)) keep
sample FALSE TRUE
4861STDY7387181 474 415
4861STDY7387182 454 393
4861STDY7387183 10 3
4861STDY7771115 44 3946
4861STDY7771123 12 822
GSM4577306 1506 791
GSM4577307 2427 2142
GSM4577308 1947 4972
GSM4577309 12015 14387
GSM4577310 2805 2512
GSM4577311 2717 2171
GSM4577312 1099 2595
GSM4577313 85 172
GSM4577314 1749 1219
GSM4577315 1352 1812
Huang_GSM6605437 976 2730
Huang_GSM6605438 442 710
Huang_GSM6605439 304 1604
Huang_GSM6605440 408 1434
Huang_GSM7277296 439 1876
Huang_GSM7277297 225 657
Huang_GSM7277298 137 1127
Lai_GSM5572238 5 19
Lai_GSM5572239 79 6
Lai_GSM5572240 167 13
MRC_Endo8625698 114 3
MRC_Endo8625699 936 90
MRC_Endo8712024 133 26
MRC_Endo8712032 215 2
MRC_Endo8715415 123 119
MRC_Endo8715416 2252 643
Shih_MEtissue1 41 53
Shih_MEtissue2 5 13
Shih_MEtissue3 68 112
ggplot(as.data.frame(tbl), aes(x = sample, y = Freq, fill = keep)) +
geom_col(position = "stack") + theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "", y = "Number of cells")ggplot(as.data.frame(tbl), aes(x = sample, y = Freq, fill = keep)) +
geom_col(position = "fill") + theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "", y = "Fraction of cells")dat <- dat[, which(dat$keep)]
dim(dat)[1] 20231 49589
(tbl <- sort(table(dat$sampleCol)))
MRC_Endo8712032 4861STDY7387183 MRC_Endo8625698 Lai_GSM5572239
2 3 3 6
Lai_GSM5572240 TB13021061 Lai_GSM5572238 MRC_Endo8712024
13 13 19 26
TB13021372 MRC_Endo8625699 TB13020851 MRC_Endo8715415
53 90 112 119
GSM4577313 4861STDY7387182 4861STDY7387181 MRC_Endo8715416
172 393 415 643
Huang_GSM7277297 Huang_GSM6605438 GSM4577306 4861STDY7771123
657 710 791 822
Huang_GSM7277298 GSM4577314 Huang_GSM6605440 Huang_GSM6605439
1127 1219 1434 1604
GSM4577315 Huang_GSM7277296 GSM4577307 GSM4577311
1812 1876 2142 2171
GSM4577310 GSM4577312 Huang_GSM6605437 4861STDY7771115
2512 2595 2730 3946
GSM4577308 GSM4577309
4972 14387
keepsamples <- names(tbl)[tbl > 30]
dat <- dat[, dat$sampleCol %in% keepsamples]
dim(dat)[1] 20231 49504
tbl <- table(dat[[sid]], dat[[stid]])
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
cluster_columns = FALSE, cluster_rows = FALSE,
row_names_gp = gpar(fontsize = 8),
name = "Number of\ncells per\nsample",
rect_gp = gpar(col = "grey", lwd = 1),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
}))Recalculate QC metrics for genes based on remaining cells
dat <- scuttle::addPerFeatureQC(dat, assay.type = "counts")
rowData(dat)$intronic_fraction <-
rowSums(assay(dat, "unspliced")) / rowSums(assay(dat, "total"))Most highly expressed genes
scater::plotHighestExprs(dat, n = 25, colour_cells_by = stid,
exprs_values = "counts")Calculate size factors and logcounts
size_factor_threshold <- 0.1
set.seed(123)
clusters <- scran::quickCluster(
dat, method = "igraph", min.mean = 0.1,
use.ranks = FALSE, BSPARAM = BiocSingular::ExactParam(),
block = dat[[datasetvar]],
min.size = min(table(dat[[datasetvar]]))
)
dat <- scran::computeSumFactors(dat, min.mean = 0.1, cluster = clusters)
summary(sizeFactors(dat)) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.06493 0.28916 0.59723 1.00000 1.34411 15.57060
ggplot(data.frame(libSize = dat$sum,
sizeFactor = sizeFactors(dat),
group = dat[[blockvar]],
stringsAsFactors = FALSE),
aes(x = libSize, y = sizeFactor, color = group)) +
geom_point(alpha = 0.5) + theme_bw() +
geom_hline(yintercept = size_factor_threshold, linetype = "dashed") +
xlab("Total UMI count") + ylab("Size factor")dat <- dat[, sizeFactors(dat) > size_factor_threshold]
dim(dat)[1] 20231 48922
dat <- batchelor::multiBatchNorm(dat, batch = dat[[batchvar]], min.mean = 0.1)
ggplot(data.frame(libSize = dat$sum,
sizeFactor = sizeFactors(dat),
group = dat[[blockvar]],
stringsAsFactors = FALSE),
aes(x = libSize, y = sizeFactor, color = group)) +
geom_point(alpha = 0.5) + theme_bw() +
geom_hline(yintercept = size_factor_threshold, linetype = "dashed") +
xlab("Total UMI count") + ylab("Size factor")Cell cycle assignment
## tricycle
set.seed(123)
dat <- project_cycle_space(dat, species = "human", gname.type = "SYMBOL")
dat <- estimate_cycle_position(dat)
dat <- estimate_Schwabe_stage(dat, gname.type = "SYMBOL", species = "human",
batch.v = batchvar, tolerance = 0)
plotReducedDim(dat, "tricycleEmbedding", point_size = 0.5,
colour_by = "CCStage")tbl <- table(dat$StageFactor, dat$CCStage)
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
cluster_columns = FALSE, cluster_rows = FALSE,
row_names_gp = gpar(fontsize = 8),
name = "Number of\ncells",
rect_gp = gpar(col = "grey", lwd = 1),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
}))## Seurat (using genes from HECA)
s_genes <- read.delim(s_genes)[, 1]
g2m_genes <- read.delim(g2m_genes)[, 1]
seurat <- CreateSeuratObject(counts = assay(dat, "counts"))
seurat <- NormalizeData(seurat, verbose = FALSE)
seurat <- FindVariableFeatures(seurat, selection.method = "vst", verbose = FALSE)
seurat <- ScaleData(seurat, features = rownames(seurat), verbose = FALSE)
seurat <- CellCycleScoring(seurat, s.features = s_genes, g2m.features = g2m_genes,
set.ident = TRUE)
stopifnot(all(rownames(seurat@meta.data) == colnames(dat)))
dat$SeuratCC <- seurat@meta.data$Phase
tbl <- table(dat$StageFactor, dat$SeuratCC)
draw(Heatmap(tbl, col = circlize::colorRamp2(c(0, max(tbl)), c("white", "steelblue")),
cluster_columns = FALSE, cluster_rows = FALSE,
row_names_gp = gpar(fontsize = 8),
name = "Number of\ncells",
rect_gp = gpar(col = "grey", lwd = 1),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(sprintf("%d", tbl[i, j]), x, y, gp = gpar(fontsize = 8))
}))Find and mark doublets
set.seed(123)
dat <- scDblFinder(dat, samples = blockvar, verbose = FALSE,
BPPARAM = MulticoreParam(workers = 8L))
table(dat$scDblFinder.class, dat[[blockvar]])
4861STDY7387181 4861STDY7387182 4861STDY7771115 4861STDY7771123
singlet 380 374 3698 764
doublet 35 19 248 28
GSM4577306 GSM4577307 GSM4577308 GSM4577309 GSM4577310 GSM4577311
singlet 758 2013 4421 12408 2356 2042
doublet 31 111 384 1657 136 129
GSM4577312 GSM4577313 GSM4577314 GSM4577315 Huang_GSM6605437
singlet 2442 156 1159 1723 2539
doublet 139 16 59 82 191
Huang_GSM6605438 Huang_GSM6605439 Huang_GSM6605440 Huang_GSM7277296
singlet 665 1525 1361 1780
doublet 45 79 73 96
Huang_GSM7277297 Huang_GSM7277298 MRC_Endo8625699 MRC_Endo8715415
singlet 633 1077 83 113
doublet 24 50 7 5
MRC_Endo8715416 Shih_MEtissue1 Shih_MEtissue3
singlet 620 49 98
doublet 23 4 14
Sketching
We find highly variable genes while blocking on the batch, to focus on the biological variance.
set.seed(123)
sketchcells.bybatch <- unlist(lapply(unique(colData(dat)[[batchvar]]), function(b) {
subd <- dat[, which(colData(dat)[[batchvar]] == b)]
geneVar <- scran::modelGeneVar(subd)
subhvgs <- scran::getTopHVGs(geneVar, n = 2000)
subd <- scater::runPCA(subd, exprs_values = "logcounts", ncomponents = 30,
name = "PCAsub",
subset_row = subhvgs, scale = FALSE)
idxsub <- geosketch(reducedDim(subd, "PCAsub"),
N = min(ncol(subd), max(0.1 * ncol(subd), 150)),
seed = 123)
colnames(subd)[idxsub]
}))Warning in (function (A, nv = 5, nu = nv, maxit = 1000, work = nv + 7, reorth =
TRUE, : You're computing too large a percentage of total singular values, use a
standard svd instead.
dat$sketched.bybatch <- ifelse(colnames(dat) %in% sketchcells.bybatch, TRUE, FALSE)
table(batch = colData(dat)[[batchvar]], sketched.bybatch = dat$sketched.bybatch) sketched.bybatch
batch FALSE TRUE
A13 658 150
A30 4265 473
E1 0 90
E3 611 150
GSM4577306 639 150
GSM4577307 1912 212
GSM4577308 4325 480
GSM4577309 12659 1406
GSM4577310 2243 249
GSM4577311 1954 217
GSM4577312 2323 258
GSM4577313 22 150
GSM4577314 1068 150
GSM4577315 1625 180
GSM6605437 2457 273
GSM6605438 560 150
GSM6605439 1444 160
GSM6605440 1284 150
GSM7277296 1689 187
GSM7277297 507 150
GSM7277298 977 150
TB13020851 0 112
TB13021372 0 53
Find an alternative reduced dimension representation, after sketching
set.seed(123)
## First find a new set of HVGs
dattmp <- dat[, sketchcells.bybatch]
stopifnot(all(rownames(dattmp) == rownames(dat)))
rowData(dat)$scran_geneVar_geosketch <-
scran::modelGeneVar(dattmp, block = dattmp[[datasetvar]])
hvgs <- scran::getTopHVGs(rowData(dat)$scran_geneVar_geosketch, n = 3000)
metadata(dat)$hvg.scran.genevar.geosketch <- hvgs
head(
as.data.frame(
rowData(dat)$scran_geneVar_geosketch[
order(rowData(dat)$scran_geneVar_geosketch$bio,
decreasing = TRUE),
c("mean", "total", "tech", "bio", "p.value", "FDR")]
),
25
) mean total tech bio p.value FDR
PAEP 2.1066383 6.4098906 0.9501136 5.4597770 0.000000e+00 0.000000e+00
CXCL14 0.7296767 2.2408316 0.3970415 1.8437901 4.893356e-310 4.293430e-306
SCGB2A1 1.7646943 2.5129607 0.7697078 1.7432528 6.521461e-62 2.200742e-59
MT1G 1.1138977 2.0830653 0.4075629 1.6755024 6.560409e-163 1.279134e-159
SLPI 2.0791303 2.5438547 0.9892680 1.5545867 2.625436e-45 6.581593e-43
SCGB1D2 0.9692065 1.9679226 0.4401188 1.5278039 7.133695e-120 6.954560e-117
MT2A 1.7135994 2.2420725 0.7701062 1.4719663 4.255256e-62 1.493425e-59
CXCL8 1.2918438 1.9789657 0.6835757 1.2953899 1.351486e-77 6.080993e-75
WFDC2 2.5473116 2.3303857 1.0515430 1.2788427 1.691466e-23 1.890563e-21
S100A6 1.9674850 2.0970420 0.9120803 1.1849616 1.038733e-33 1.804721e-31
VIM 1.5813201 1.9106186 0.7476998 1.1629188 5.670487e-38 1.124438e-35
SOD2 2.1511436 2.0255834 0.9012891 1.1242943 1.635633e-38 3.337452e-36
CRYAB 0.9518985 1.5469181 0.5259863 1.0209318 8.073129e-72 3.079723e-69
MMP7 0.7212142 1.3917734 0.4142810 0.9774924 2.849418e-89 1.612954e-86
HSPA1A 1.4907465 1.7464946 0.7769755 0.9695191 9.968352e-27 1.356005e-24
IGFBP7 1.1095376 1.5747259 0.6082635 0.9664624 2.016097e-39 4.478286e-37
IER3 2.0451952 1.7394000 0.8661273 0.8732727 2.205551e-17 1.751268e-15
TM4SF1 1.5680490 1.6325102 0.7754204 0.8570898 7.623575e-33 1.286332e-30
CAPS 1.0299802 1.3657216 0.5303950 0.8353266 7.882191e-77 3.457917e-74
FOS 2.1156574 1.6169544 0.8387706 0.7781838 1.874416e-12 9.199434e-11
RIMKLB 0.9138842 1.2786274 0.5004791 0.7781483 5.856457e-49 1.581063e-46
GPX3 0.5820514 1.1166279 0.3528173 0.7638106 9.797900e-83 4.775932e-80
CCL20 0.4396294 1.0206143 0.2808581 0.7397562 4.732702e-131 4.885262e-128
CXCL2 0.9461257 1.2510548 0.5248689 0.7261860 3.930548e-38 7.927960e-36
SCGB1D4 0.3711827 0.9053546 0.2023049 0.7030497 1.417861e-136 1.555039e-133
## Next run multibatch PCA on the sketched cells
set.seed(123)
dattmp <- dat[, sketchcells.bybatch]
mbpca <- multiBatchPCA(dattmp, batch = dattmp[[batchvar]], d = 30,
subset.row = metadata(dat)$hvg.scran.genevar.geosketch,
weights = FALSE, preserve.single = TRUE)
projection <- mbpca[[1]]
rotation <- metadata(mbpca)$rotation
centers <- metadata(mbpca)$centers
## Sanity check
tmp <- t(sweep(assay(dattmp[names(centers), ], "logcounts"), MARGIN = 1,
STATS = centers, FUN = "-")) %*% rotation
max(abs(tmp - projection))[1] 1.048051e-13
## Project all cells
pca <- t(sweep(assay(dat[names(centers), ], "logcounts"), MARGIN = 1,
STATS = centers, FUN = "-")) %*% rotationWarning in asMethod(object): sparse->dense coercion: allocating vector of size
1.1 GiB
## Perform batch correction in the PCA space
stopifnot(all(rownames(pca) == colnames(dat)))
reducedDim(dat, "multibatchPCA") <- as.matrix(pca)
mnn <- reducedMNN(as.matrix(pca), batch = dat[[batchvar]], k = 20,
BPPARAM = MulticoreParam(workers = 8L))
stopifnot(all(rownames(mnn) == colnames(dat)))
reducedDim(dat, "fastMNNcorr_sketch") <- mnn$correctedDimension reduction
Apply tSNE and UMAP to sketched cells and project the others.
## TSNE
set.seed(123)
tsne <- snifter::fitsne(
reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"),
n_jobs = 8L, random_state = 123L,
perplexity = 30, n_components = 2L,
pca = FALSE)
proj <- snifter::project(
x = tsne,
new = reducedDim(dat, "fastMNNcorr_sketch"),
old = reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"))
rownames(proj) <- colnames(dat)
reducedDim(dat, "TSNE_fastMNNcorr_sketch") <- proj
## UMAP
set.seed(123)
umap <- uwot::umap(
X = reducedDim(dat[, sketchcells.bybatch], "fastMNNcorr_sketch"),
n_components = 2, n_neighbors = 50,
pca = NULL, scale = FALSE,
ret_model = TRUE, n_threads = 8L)
proj <- uwot::umap_transform(
X = reducedDim(dat, "fastMNNcorr_sketch"),
model = umap,
n_threads = 8L)
rownames(proj) <- colnames(dat)
reducedDim(dat, "UMAP_fastMNNcorr_sketch") <- projShuffle cells (to avoid plotting in layers by sample later)
set.seed(123)
dat <- dat[, sample(seq_len(ncol(dat)), ncol(dat), replace = FALSE)]Clustering
## Helper function
## method should be "louvain" or "leiden"
clusterfcn <- function(sce, dimred = "fastMNNcorr", method = "leiden",
resolutions = c(0.05, 0.1, 0.2, 0.4, 0.6, 1.0),
sketched_cells = NULL) {
if (is.null(sketched_cells)) {
for (r in resolutions) {
set.seed(123)
mat <- reducedDim(sce, dimred)
clst <- bluster::clusterRows(
mat,
BLUSPARAM = bluster::SNNGraphParam(
k = 10, type = "rank",
num.threads = 1L,
cluster.fun = method,
cluster.args = list(resolution = r)))
colData(sce)[[paste0(method, "_res", r)]] <- as.character(clst)
}
} else {
train <- reducedDim(sce[, sketched_cells], dimred)
test <- reducedDim(sce[, setdiff(colnames(sce), sketched_cells)], dimred)
for (r in resolutions) {
set.seed(123)
df <- data.frame(cell = colnames(sce), cluster = NA_character_)
clst <- as.character(bluster::clusterRows(
train,
BLUSPARAM = bluster::SNNGraphParam(
k = 10, type = "rank",
num.threads = 1L,
cluster.fun = method,
cluster.args = list(resolution = r))))
preds <- class::knn(train = train, test = test,
cl = clst, k = 1) |> as.character()
df$cluster[match(rownames(train), df$cell)] <- clst
df$cluster[match(rownames(test), df$cell)] <- preds
stopifnot(colnames(sce) == df$cell)
colData(sce)[[paste0(method, "_res", r)]] <- df$cluster
}
}
sce
}
dat <- clusterfcn(dat, dimred = "fastMNNcorr_sketch", method = "leiden",
resolutions = 0.4,
sketched_cells = sketchcells.bybatch)Annotate clusters
clust_annot <- stack(
list("pre-ciliated" = 2,
"ciliated" = c(1, 3, 8, 30),
"secretory late" = 41,
"secretory mid" = c(22, 25, 35, 34, 40, 38, 21, 18, 19, 23),
"secretory early" = c(26, 31, 36, 37),
"luminal late" = c(45, 28, 32, 46, 20, 33),
"transcriptionally active" = c(15, 10, 39),
"luminal early" = 14,
"MUC5B+" = 42,
"KRT5+" = 43,
"remodeling" = c(16, 24, 13),
"cycling" = c(4, 29, 6, 11, 12, 9),
"hormone responsive early" = c(5, 7, 47),
"hormone responsive late" = c(17, 44, 27))
)
dat$leiden_res0.4_annot <- clust_annot$ind[match(dat$leiden_res0.4,
as.character(clust_annot$values))]
table(dat$leiden_res0.4_annot)
pre-ciliated ciliated secretory late
178 3595 1265
secretory mid secretory early luminal late
11291 4299 10567
transcriptionally active luminal early MUC5B+
1402 1274 152
KRT5+ remodeling cycling
81 734 6035
hormone responsive early hormone responsive late
4874 3175
leiden_res0.4_annot_cols <- c(
"pre-ciliated" = "lightskyblue",
"ciliated" = "royalblue",
"secretory early" = "#C8AEE2",
"secretory mid" = "#7D4DBA",
"secretory late" = "#3A1260",
"luminal early" = "gold1",
"luminal late" = "darkorange",
"transcriptionally active" = "turquoise4",
"MUC5B+" = "salmon",
"KRT5+" = "firebrick",
"remodeling" = "#4CAF50",
"cycling" = "mediumaquamarine",
"hormone responsive early" = "hotpink2",
"hormone responsive late" = "hotpink4"
) Dot plots of clusters and marker genes of interest
col = circlize::colorRamp2(
breaks = seq(0, 7.5, length.out = 64),
colors = colorRampPalette(c("grey90", "purple"))(64))
markers <- intersect(
c("MKI67", "MMP7", "STMN1", "PKM", "ALDH1A1", "VIM", "LGR5", "LEFTY1",
"IL6", "ENPP3", "WNT7A", "IL32", "SAA1", "SAA2", "CLU", "MUC5B", "LCN2",
"LY6E", "S100A6", "PLAUR", "PLAU", "SFN", "TNFRSF12A", "IL23A", "KRT17",
"KRT5", "ANXA1", "CXCL8", "SCGB2A1", "SBGB2A2", "SCGB1D2", "IL6ST",
"PAEP", "FGF7", "PGR", "ESR1", "CCNO", "FOXJ1", "PIFO", "RSPH1", "DKK1"),
rownames(dat))
plotDots(dat, features = markers,
group = "leiden_res0.4_annot",
swap_rownames = "gene_name", zlim = c(0, 7.5),
colour = col(seq(0, 7.5, length.out = 64))) +
scale_size(limits=c(0.2, 1)) +
labs(x = "") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))Marker genes from HECA paper (https://www.nature.com/articles/s41588-024-01873-w, Fig 2a).
markers <- intersect(
c("PAX8", "EPCAM", "FOXA2", "MMP7", "SOX9", "AXIN2", "TRH", "ALDH1A1",
"CDH2", "KLK11", "SLC7A11", "DKK1", "PHLDA1", "KMO", "IHH", "EMID1",
"HPRT1", "SUFU", "OPRK1", "CBR3", "S100P", "SCGB2A2", "ABCG1", "PAEP",
"DPP4", "GPX3", "PTPRR", "FGF7", "FXYD2", "IL32", "WNT7A", "LGR5",
"MMP7", "TNF", "VTCN1", "CLDN22", "SULT1E1", "LEFTY1", "PTGS1", "IL6",
"CCNO", "CDC20B", "MUC12", "PIFO", "FOXJ1", "TP73", "BPIFB1", "TFF3",
"MUC5B", "SAA1", "TP63", "SNCG", "KRT5", "PGR", "AR", "ESR1"),
rownames(dat))
plotDots(dat, features = markers,
group = "leiden_res0.4_annot",
swap_rownames = "gene_name", zlim = c(0, 7.5),
colour = col(seq(0, 7.5, length.out = 64))) +
scale_size(limits=c(0.2, 1)) +
labs(x = "") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))Plot dimension reductions
stage_colors <- c(
"early-secretory" = "#F4EBFA",
"early-mid-secretory" = "#C8AEE2",
"proliferative" = "hotpink2",
"mid-secretory" = "#7D4DBA",
"late-secretory" = "#3A1260",
"menstrual" = "#A30B2E"
)
seurat_colors <- c(
"S" = "gold",
"G1" = "grey80",
"G2M" = "brown"
)
plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch",
other_fields = "HECA_Celltype", point_size = 0.5) +
geom_point(aes(color = HECA_Celltype), size = 0.5) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
theme_void()for (vrb in c("RCA_Celltype", "RCA_Location", "RCA_BiopsyType",
"matchDataset", "StageCoarse", "donorCol",
"scDblFinder.class", "leiden_res0.4")) {
print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch",
colour_by = vrb, point_size = 0.5) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
theme_void())
}print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch",
colour_by = "leiden_res0.4_annot", point_size = 0.5) +
scale_color_manual(values = leiden_res0.4_annot_cols, name = "") +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
theme_void())Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch",
colour_by = "Stage", point_size = 0.5) +
scale_color_manual(values = stage_colors, name = "") +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
theme_void())Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch",
colour_by = "SeuratCC", point_size = 0.5) +
scale_color_manual(values = seurat_colors, name = "") +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
theme_void())Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
for (vrb in c("subsets_MT_percent", "detected", "intronic_fraction")) {
print(plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch",
colour_by = vrb, point_size = 0.5) +
theme_void())
}Plot cluster abundance over the cycle
col <- "leiden_res0.4_annot"
df <- data.frame(colData(dat)[, c("StageFactor", col)]) |>
group_by(StageFactor, .data[[col]]) |>
tally() |>
group_by(StageFactor) |>
mutate(perc = n / sum(n) * 100) |>
group_by(.data[[col]]) |>
mutate(scaledperc = perc / sum(perc)) |>
ungroup() |>
dplyr::filter(!is.na(StageFactor))
ggplot(df, aes(x = StageFactor, y = perc, color = .data[[col]])) +
geom_point() +
geom_line(aes(x = as.numeric(StageFactor)), linewidth = 1) +
theme_cowplot() +
labs(x = "", y = "Relative abundance (%)",
title = col) +
theme(legend.position = "bottom") +
scale_color_manual(values = leiden_res0.4_annot_cols, name = "")## One panel per cluster
ggplot(df, aes(x = StageFactor, y = perc, color = .data[[col]])) +
geom_point() +
geom_line(aes(x = as.numeric(StageFactor)), linewidth = 1) +
theme_cowplot() +
labs(x = "", y = "Relative abundance (%)",
title = col) +
facet_wrap(~ .data[[col]], scales = "free_y") +
scale_color_manual(values = leiden_res0.4_annot_cols, name = "") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
strip.text = element_text(size = 10))Find marker genes
First, we compare all clusters in a pairwise manner. The comparison is only performed for genes with a non-zero count in at least 0.05% of the cells (corresponding to 25 of the 48922 cells in the data set). These gene IDs are also written to a file, for potential use as the ‘universe’ in over-representation analyses.
frac_detected <- rowMeans(assay(dat, "counts") > 0)
genes_to_test <- rownames(dat)[frac_detected >= (0.05 / 100)]
write.table(genes_to_test, file = "epithelial_cell_atlas_gene_universe.txt",
row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
clc <- "leiden_res0.4_annot"
ptts <- pairwiseTTests(dat, groups = dat[[clc]],
block = dat[[blockvar]], direction = "up",
subset.row = genes_to_test)
markers <- combineMarkers(ptts$statistics, ptts$pairs, pval.type = "all")
## Write results to files
ttestdir <- paste0("pairwise_ttests_", clc)
if (!file.exists(ttestdir)) {
dir.create(ttestdir, recursive = TRUE)
}
for (i in seq_len(nrow(ptts$pairs))) {
df <- cbind(gene = rownames(ptts$statistics[[i]]),
ptts$statistics[[i]])
df <- df[order(df$p.value, -df$logFC), ]
filename <- paste0(clc, "_cluster_", ptts$pairs$first[i],
"_vs_", ptts$pairs$second[i], ".txt")
write.table(df, file = file.path(ttestdir, filename), quote = FALSE,
sep = "\t", row.names = FALSE, col.names = TRUE)
}
markerdir <- paste0("marker_genes_all_", clc)
if (!file.exists(markerdir)) {
dir.create(markerdir, recursive = TRUE)
}
for (i in seq_along(markers)) {
df <- cbind(gene = rownames(markers[[i]]), markers[[i]])
filename <- paste0(clc, "_cluster_", names(markers)[i], "_markers.txt")
write.table(df, file = file.path(markerdir, filename), quote = FALSE,
sep = "\t", row.names = FALSE, col.names = TRUE)
}Save object
saveRDS(dat, file = "invivo_epithelial_cell_atlas.rds")Session info
sessionInfo()R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Rocky Linux 8.10 (Green Obsidian)
Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Zurich
tzcode source: system (glibc)
attached base packages:
[1] parallel grid stats4 stats graphics grDevices utils
[8] datasets methods base
other attached packages:
[1] DT_0.33 igraph_2.1.4
[3] BiocParallel_1.40.0 bluster_1.16.0
[5] org.Hs.eg.db_3.20.0 AnnotationDbi_1.68.0
[7] tricycle_1.14.0 sketchR_1.2.0
[9] Seurat_5.2.1 SeuratObject_5.0.2
[11] sp_2.2-0 scDblFinder_1.20.2
[13] circlize_0.4.16 ComplexHeatmap_2.22.0
[15] cowplot_1.1.3 batchelor_1.22.0
[17] msigdbr_7.5.1 tidyr_1.3.1
[19] dplyr_1.1.4 scran_1.34.0
[21] scater_1.34.0 ggplot2_3.5.1
[23] scuttle_1.16.0 SingleCellExperiment_1.28.1
[25] SummarizedExperiment_1.36.0 Biobase_2.66.0
[27] GenomicRanges_1.58.0 GenomeInfoDb_1.42.3
[29] IRanges_2.40.1 S4Vectors_0.44.0
[31] BiocGenerics_0.52.0 MatrixGenerics_1.18.1
[33] matrixStats_1.5.0 fishpond_2.12.0
loaded via a namespace (and not attached):
[1] spatstat.sparse_3.1-0 bitops_1.0-9
[3] httr_1.4.7 RColorBrewer_1.1-3
[5] doParallel_1.0.17 tools_4.4.2
[7] sctransform_0.4.1 R6_2.6.1
[9] ResidualMatrix_1.16.0 lazyeval_0.2.2
[11] uwot_0.2.3 GetoptLong_1.0.5
[13] withr_3.0.2 gridExtra_2.3
[15] progressr_0.15.1 cli_3.6.4
[17] Cairo_1.6-2 spatstat.explore_3.3-4
[19] fastDummies_1.7.5 labeling_0.4.3
[21] sass_0.4.9 mvtnorm_1.3-3
[23] spatstat.data_3.1-4 ggridges_0.5.6
[25] pbapply_1.7-2 Rsamtools_2.22.0
[27] dichromat_2.0-0.1 parallelly_1.42.0
[29] limma_3.62.2 RSQLite_2.3.9
[31] circular_0.5-1 generics_0.1.3
[33] shape_1.4.6.1 BiocIO_1.16.0
[35] crosstalk_1.2.1 gtools_3.9.5
[37] ica_1.0-3 spatstat.random_3.3-2
[39] Matrix_1.7-1 ggbeeswarm_0.7.2
[41] abind_1.4-8 lifecycle_1.0.4
[43] yaml_2.3.10 edgeR_4.4.2
[45] SparseArray_1.6.2 Rtsne_0.17
[47] blob_1.2.4 promises_1.3.2
[49] dqrng_0.4.1 crayon_1.5.3
[51] dir.expiry_1.14.0 miniUI_0.1.1.1
[53] lattice_0.22-6 beachmat_2.22.0
[55] KEGGREST_1.46.0 magick_2.8.5
[57] pillar_1.10.1 knitr_1.49
[59] metapod_1.14.0 boot_1.3-31
[61] rjson_0.2.23 xgboost_1.7.8.1
[63] future.apply_1.11.3 codetools_0.2-20
[65] glue_1.8.0 spatstat.univar_3.1-1
[67] data.table_1.17.0 vctrs_0.6.5
[69] png_0.1-8 spam_2.11-1
[71] gtable_0.3.6 assertthat_0.2.1
[73] cachem_1.1.0 xfun_0.51
[75] S4Arrays_1.6.0 mime_0.12
[77] survival_3.7-0 iterators_1.0.14
[79] statmod_1.5.0 fitdistrplus_1.2-2
[81] ROCR_1.0-11 nlme_3.1-166
[83] bit64_4.6.0-1 filelock_1.0.3
[85] RcppAnnoy_0.0.22 bslib_0.9.0
[87] irlba_2.3.5.1 vipor_0.4.7
[89] KernSmooth_2.23-24 colorspace_2.1-1
[91] DBI_1.2.3 tidyselect_1.2.1
[93] bit_4.5.0.1 compiler_4.4.2
[95] curl_6.2.1 BiocNeighbors_2.0.1
[97] basilisk.utils_1.18.0 DelayedArray_0.32.0
[99] plotly_4.10.4 rtracklayer_1.66.0
[101] scales_1.3.0.9000 lmtest_0.9-40
[103] stringr_1.5.1 digest_0.6.37
[105] goftest_1.2-3 spatstat.utils_3.1-2
[107] rmarkdown_2.29 basilisk_1.18.0
[109] XVector_0.46.0 htmltools_0.5.8.1
[111] pkgconfig_2.0.3 sparseMatrixStats_1.18.0
[113] fastmap_1.2.0 rlang_1.1.5
[115] GlobalOptions_0.1.2 htmlwidgets_1.6.4
[117] UCSC.utils_1.2.0 shiny_1.10.0
[119] DelayedMatrixStats_1.28.1 jquerylib_0.1.4
[121] farver_2.1.2 zoo_1.8-13
[123] jsonlite_1.9.0 BiocSingular_1.22.0
[125] RCurl_1.98-1.16 magrittr_2.0.3
[127] GenomeInfoDbData_1.2.13 dotCall64_1.2
[129] patchwork_1.3.0 Rcpp_1.0.14
[131] ggnewscale_0.5.1 babelgene_22.9
[133] viridis_0.6.5 reticulate_1.41.0
[135] stringi_1.8.4 zlibbioc_1.52.0
[137] MASS_7.3-61 plyr_1.8.9
[139] listenv_0.9.1 ggrepel_0.9.6
[141] deldir_2.0-4 Biostrings_2.74.1
[143] splines_4.4.2 tensor_1.5
[145] locfit_1.5-9.11 spatstat.geom_3.3-5
[147] RcppHNSW_0.6.0 reshape2_1.4.4
[149] ScaledMatrix_1.14.0 XML_3.99-0.18
[151] evaluate_1.0.3 snifter_1.16.0
[153] foreach_1.5.2 httpuv_1.6.15
[155] RANN_2.6.2 purrr_1.0.4
[157] polyclip_1.10-7 future_1.34.0
[159] clue_0.3-66 scattermore_1.2
[161] rsvd_1.0.5 xtable_1.8-4
[163] restfulr_0.0.15 svMisc_1.4.3
[165] RSpectra_0.16-2 later_1.4.1
[167] class_7.3-22 viridisLite_0.4.2
[169] tibble_3.2.1 memoise_2.0.1
[171] beeswarm_0.4.0 GenomicAlignments_1.42.0
[173] cluster_2.1.6 globals_0.16.3