suppressPackageStartupMessages({
library(SingleCellExperiment)
library(scuttle)
library(scater)
library(dplyr)
library(tidyr)
library(ggplot2)
library(cowplot)
library(ComplexHeatmap)
library(circlize)
library(ggtext)
library(gprofiler2)
library(DT)
library(forcats)
})
source("../params.R")
source("../helpers.R")
if (!file.exists("paper_figures")) {
dir.create("paper_figures", showWarnings = FALSE, recursive = FALSE)
}
if (!file.exists("gprofiler")) {
dir.create("gprofiler", showWarnings = FALSE, recursive = FALSE)
}Generating plots for paper (epithelial atlas)
Load packages
Load data
dat <- readRDS("invivo_epithelial_cell_atlas.rds")Numeric summaries
# number of genes and cells
dim(dat)[1] 20231 48922
# donors and samples
tmp <- colData(dat)[, c("donorCol", "sampleCol", "matchDataset")] |>
as.data.frame() |> distinct() |>
arrange(matchDataset, donorCol, sampleCol)
rownames(tmp) <- NULL
tmp donorCol sampleCol matchDataset
1 A13 4861STDY7387181 GarciaAlonso
2 A13 4861STDY7387182 GarciaAlonso
3 A30 4861STDY7771115 GarciaAlonso
4 A30 4861STDY7771123 GarciaAlonso
5 E1 MRC_Endo8625699 GarciaAlonso
6 E3 MRC_Endo8715415 GarciaAlonso
7 E3 MRC_Endo8715416 GarciaAlonso
8 GSM6605437 Huang_GSM6605437 Huang
9 GSM6605438 Huang_GSM6605438 Huang
10 GSM6605439 Huang_GSM6605439 Huang
11 GSM6605440 Huang_GSM6605440 Huang
12 GSM7277296 Huang_GSM7277296 Huang
13 GSM7277297 Huang_GSM7277297 Huang
14 GSM7277298 Huang_GSM7277298 Huang
15 TB13020851 TB13020851 Shih
16 TB13021372 TB13021372 Shih
17 GSM4577306 GSM4577306 Wang
18 GSM4577307 GSM4577307 Wang
19 GSM4577308 GSM4577308 Wang
20 GSM4577309 GSM4577309 Wang
21 GSM4577310 GSM4577310 Wang
22 GSM4577311 GSM4577311 Wang
23 GSM4577312 GSM4577312 Wang
24 GSM4577313 GSM4577313 Wang
25 GSM4577314 GSM4577314 Wang
26 GSM4577315 GSM4577315 Wang
# number of unique values in each column
vapply(tmp, function(x) length(unique(x)), NA_real_) donorCol sampleCol matchDataset
23 26 4
Define colors
dat$leiden_res0.4_annot <- factor(dat$leiden_res0.4_annot,
levels = tissue_leiden_res0.4_annot_order)Figure 2A (UMAP with clusters)
(umap_clusters <- plotReducedDim(dat, dimred = "UMAP_fastMNNcorr_sketch",
colour_by = "leiden_res0.4_annot",
point_size = 0.5) +
scale_color_manual(values = tissue_leiden_res0.4_annot_colors, name = "") +
guides(color = guide_legend(nrow = 4, byrow = FALSE, title = "",
override.aes = list(size = 5, alpha = 1,
shape = 15))) +
theme_void() +
theme(legend.text = element_text(size = 20),
legend.position = "bottom",
axis.title = element_blank()))Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
png("paper_figures/UMAP_tissue_clusters.png",
height = 4, width = 4, units = "in", res = 300)
umap_clusters + theme(legend.position = "none")
dev.off()png
2
pdf("paper_figures/UMAP_tissue_clusters_legend.pdf",
height = 1.75, width = 9)
grid.newpage()
grid.draw(get_legend2(umap_clusters + theme(legend.position = "bottom",
legend.text = element_text(size = 12))))
dev.off()png
2
pdf("paper_figures/UMAP_tissue_clusters.pdf", height = 4, width = 4)
umap_clusters + theme(legend.position = "none")
dev.off()png
2
Figure 2B (cluster abundance over cycle)
df <- data.frame(colData(dat)[, c("StageFactor", "leiden_res0.4_annot")]) |>
group_by(StageFactor, leiden_res0.4_annot) |>
tally() |>
group_by(StageFactor) |>
mutate(perc = n / sum(n) * 100) |>
dplyr::filter(!is.na(StageFactor)) |>
ungroup() |>
complete(StageFactor, leiden_res0.4_annot, fill = list(n = 0, perc = 0))
(clust_abund_plot <- ggplot(df, aes(x = StageFactor, y = perc,
color = leiden_res0.4_annot)) +
geom_point() +
geom_line(aes(x = as.numeric(StageFactor)), linewidth = 1) +
theme_cowplot() +
labs(x = "", y = "Relative abundance (%)") +
facet_wrap(~ leiden_res0.4_annot, scales = "free_y", nrow = 3) +
scale_color_manual(values = tissue_leiden_res0.4_annot_colors, name = "") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = "white", color = "white")))pdf("paper_figures/rel_abund_allclusters_tissue.pdf", height = 5, width = 11)
clust_abund_plot
dev.off()png
2
Figure 2C (dot plot)
markers <- intersect(tissue_markers, rownames(dat))
(dotplot_norm <- .makeDotPlot(
dat = dat, markers = markers,
cluster_column = "leiden_res0.4_annot",
cluster_order = tissue_leiden_res0.4_annot_order,
cluster_colors = tissue_leiden_res0.4_annot_colors,
transpose = TRUE
))Warning: Removed 448 rows containing missing values or values outside the scale range
(`geom_point()`).
pdf("paper_figures/dotplot_tissue_clusters_normalized.pdf", height = 6, width = 18)
dotplot_normWarning: Removed 448 rows containing missing values or values outside the scale range
(`geom_point()`).
dev.off()png
2
png("paper_figures/dotplot_tissue_clusters_normalized.png", height = 6, width = 18,
unit = "in", res = 300)
dotplot_normWarning: Removed 448 rows containing missing values or values outside the scale range
(`geom_point()`).
dev.off()png
2
Extended Data 5 (UMAPs)
(umap_matchdataset <- plotReducedDim(
dat, "UMAP_fastMNNcorr_sketch", colour_by = "matchDataset",
point_size = 1) +
scale_color_manual(values = dataset_colors) +
guides(color = guide_legend(override.aes = list(size = 4),
title = "Dataset")) +
theme_void() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20),
legend.key.size = unit(1.5, "lines")))Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
png("paper_figures/UMAP_tissue_matchDataset.png",
height = 4, width = 7, units = "in", res = 300)
plot_grid(
umap_matchdataset + theme(legend.position = "none"),
get_legend2(umap_matchdataset),
nrow = 1, rel_widths = c(4, 3)
)
dev.off()png
2
(umap_stage <- plotReducedDim(
dat, "UMAP_fastMNNcorr_sketch", colour_by = "StageFactor",
point_size = 1) +
scale_color_manual(values = stage_colors) +
guides(color = guide_legend(override.aes = list(size = 4),
title = "Stage")) +
theme_void() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20),
legend.key.size = unit(1.5, "lines")))Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
png("paper_figures/UMAP_tissue_stage.png",
height = 4, width = 7, units = "in", res = 300)
plot_grid(
umap_stage + theme(legend.position = "none"),
get_legend2(umap_stage),
nrow = 1, rel_widths = c(4, 3)
)
dev.off()png
2
(umap_cc <- plotReducedDim(
dat, "UMAP_fastMNNcorr_sketch", colour_by = "SeuratCC",
point_size = 1) +
scale_color_manual(values = cc_colors) +
guides(color = guide_legend(override.aes = list(size = 4),
title = "Cell cycle")) +
theme_void() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20),
legend.key.size = unit(1.5, "lines")))Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
png("paper_figures/UMAP_tissue_seuratCC.png",
height = 4, width = 7, units = "in", res = 300)
plot_grid(
umap_cc + theme(legend.position = "none"),
get_legend2(umap_cc),
nrow = 1, rel_widths = c(4, 3)
)
dev.off()png
2
(umap_mt <- plotReducedDim(
dat, "UMAP_fastMNNcorr_sketch", colour_by = "subsets_MT_percent",
point_size = 1) +
guides(color = guide_colorbar(title = "% MT")) +
theme_void() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20),
legend.key.size = unit(1.5, "lines")))png("paper_figures/UMAP_tissue_subsets_MT_percent.png",
height = 4, width = 7, units = "in", res = 300)
plot_grid(
umap_mt + theme(legend.position = "none"),
get_legend2(umap_mt),
nrow = 1, rel_widths = c(4, 3)
)
dev.off()png
2
(umap_tricycle <- plotReducedDim(
dat, "UMAP_fastMNNcorr_sketch", colour_by = "tricyclePosition",
point_size = 1) +
guides(color = guide_colorbar(title = "tricycle\nposition")) +
theme_void() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20),
legend.key.size = unit(1.5, "lines")))png("paper_figures/UMAP_tissue_tricyclePosition.png",
height = 4, width = 7, units = "in", res = 300)
plot_grid(
umap_tricycle + theme(legend.position = "none"),
get_legend2(umap_tricycle),
nrow = 1, rel_widths = c(4, 3)
)
dev.off()png
2
(umap_intronic <- plotReducedDim(
dat, "UMAP_fastMNNcorr_sketch", colour_by = "intronic_fraction",
point_size = 1) +
guides(color = guide_colorbar(title = "intronic\nfraction")) +
theme_void() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20),
legend.key.size = unit(1.5, "lines")))png("paper_figures/UMAP_tissue_intronic_fraction.png",
height = 4, width = 7, units = "in", res = 300)
plot_grid(
umap_intronic + theme(legend.position = "none"),
get_legend2(umap_intronic),
nrow = 1, rel_widths = c(4, 3)
)
dev.off()png
2
Dot plot, early vs late luminal markers
(dotplot_early_late <- .makeDotPlot(
dat = dat,
markers = c(luminal_early_genes, luminal_late_genes),
cluster_column = "leiden_res0.4_annot",
cluster_order = tissue_leiden_res0.4_annot_order,
cluster_colors = tissue_leiden_res0.4_annot_colors,
transpose = FALSE,
split_by_ciliated = FALSE,
normalize = FALSE,
subset_to_clusters = c("luminal late", "luminal early")))Warning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).
pdf("paper_figures/dotplot_tissue_luminal_early_late.pdf", height = 10, width = 4)
dotplot_early_lateWarning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).
dev.off()png
2
png("paper_figures/dotplot_tissue_luminal_early_late.png", height = 10, width = 4,
unit = "in", res = 300)
dotplot_early_lateWarning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).
dev.off()png
2
Figure 3A, Extended Data 6B (UMAPs)
for (gn in c("CD74", "IL32", "PLAU", "LGR5", "ENPP3", "IL6", "WNT7A",
"VIM", "EPCAM", "CDH1", "MUC5B", "PDGFRB")) {
umap_gn <- plotReducedDim(
dat, "UMAP_fastMNNcorr_sketch", colour_by = gn,
point_size = 1) +
scale_color_gradientn(colors = c("#d9d9d9", "#f4a582", "#d6604d","#8b0000", '#5c0000')) +
labs(color = gn) +
theme_void() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20),
legend.key.size = unit(1.5, "lines"))
print(umap_gn)
png(paste0("paper_figures/UMAP_tissue_", gn, ".png"),
height = 4, width = 6, units = "in", res = 300)
print(plot_grid(
umap_gn + theme(legend.position = "none"),
get_legend2(umap_gn),
nrow = 1, rel_widths = c(4, 2)
))
dev.off()
}Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Overrepresentation analysis
# tested genes
invivo_gene_bg <- read.delim("epithelial_cell_atlas_gene_universe.txt",
header = FALSE)[, 1]
clusters_to_consider <- c("luminal early",
"luminal late",
"transcriptionally active")
gprofiler2::set_base_url("https://biit.cs.ut.ee/gprofiler_archive3/e112_eg59_p19")
for (cl in clusters_to_consider) {
mgenes <- read.delim(paste0("marker_genes_all_leiden_res0.4_annot/leiden_res0.4_annot_cluster_",
cl, "_markers.txt")) |>
arrange(p.value) |>
slice_head(n = 100) |>
pull(gene)
gostres <- gost(
query = mgenes,
organism = "hsapiens",
ordered_query = FALSE,
multi_query = FALSE,
significant = TRUE,
exclude_iea = TRUE,
measure_underrepresentation = FALSE,
evcodes = FALSE,
user_threshold = 0.05,
correction_method = "fdr",
domain_scope = "custom",
custom_bg = invivo_gene_bg,
numeric_ns = "",
sources = c("GO:BP"),
as_short_link = FALSE,
highlight = TRUE
)
print(head(gostres$result |>
dplyr::select(term_name, p_value, term_size, query_size, intersection_size),
n = 15) |>
mutate(p_value = signif(p_value, digits = 3)))
write.table(gostres$result |>
dplyr::select(term_name, p_value, term_size, query_size, intersection_size),
file = paste0("gprofiler/", sub(" ", "_", cl), "_markers_gobp.tsv"),
row.names = FALSE, quote = FALSE, col.names = TRUE, sep = "\t")
if (length(tissue_goterms[[cl]]) > 0) {
stopifnot(all(tissue_goterms[[cl]] %in% gostres$result$term_name))
gg <- ggplot(gostres$result |>
filter(term_name %in% tissue_goterms[[cl]]),
aes(x = -log10(p_value), y = forcats::fct_reorder(term_name, -p_value))) +
geom_point(size = 4, color = tissue_leiden_res0.4_annot_colors[cl]) +
theme_cowplot() +
labs(x = "-log10 FDR", y = "", title = cl) +
coord_cartesian(xlim = c(0, NA))
print(gg)
pdf(paste0("paper_figures/gprofiler_tissue_", sub(" ", "_", cl), ".pdf"),
height = 4, width = 7)
print(gg)
dev.off()
}
} term_name p_value term_size query_size
1 cell adhesion 4.68e-05 1058 100
2 cell-cell adhesion 1.53e-04 669 100
3 response to stimulus 5.11e-03 5974 100
4 skin development 5.54e-03 126 100
5 epidermis development 5.54e-03 178 100
6 tissue development 5.54e-03 1094 100
7 cell migration 1.20e-02 1105 100
8 regulation of protein localization 1.20e-02 643 100
9 locomotion 3.29e-02 909 100
10 regulation of cellular localization 3.29e-02 730 100
11 cell communication 3.29e-02 4458 100
12 cell motility 3.82e-02 1244 100
13 regulation of cell adhesion 3.82e-02 573 100
14 animal organ development 4.15e-02 1491 100
15 cellular response to stimulus 4.15e-02 5142 100
intersection_size
1 24
2 18
3 59
4 7
5 8
6 20
7 19
8 14
9 16
10 14
11 45
12 19
13 12
14 21
15 49
term_name p_value
1 response to corticotropin-releasing hormone 0.000249
2 cellular response to corticotropin-releasing hormone stimulus 0.000249
3 negative regulation of macromolecule metabolic process 0.001780
4 regulation of programmed cell death 0.005060
5 negative regulation of metabolic process 0.005060
6 cell death 0.007130
7 programmed cell death 0.007130
8 regulation of apoptotic process 0.007130
9 negative regulation of signal transduction 0.007130
10 response to growth factor 0.007160
11 negative regulation of cellular process 0.007160
12 negative regulation of signaling 0.007920
13 apoptotic process 0.007920
14 negative regulation of cell communication 0.007920
15 negative regulation of programmed cell death 0.008750
term_size query_size intersection_size
1 3 99 3
2 3 99 3
3 2021 99 30
4 1005 99 19
5 2181 99 30
6 1283 99 21
7 1278 99 21
8 973 99 18
9 988 99 18
10 477 99 12
11 3843 99 42
12 1030 99 18
13 1228 99 20
14 1024 99 18
15 585 99 13
term_name p_value term_size
1 nucleic acid metabolic process 0.0163 4062
2 RNA splicing 0.0163 378
3 gene expression 0.0163 4713
4 cell-cell adhesion mediated by cadherin 0.0163 38
5 RNA metabolic process 0.0163 3605
6 peptidyl-serine phosphorylation 0.0163 164
7 peptidyl-serine modification 0.0163 176
8 macromolecule metabolic process 0.0163 6917
9 RNA biosynthetic process 0.0163 3427
10 nucleic acid biosynthetic process 0.0163 3517
11 negative regulation of hippo signaling 0.0163 14
12 hippo signaling 0.0205 43
13 regulation of primary metabolic process 0.0251 4341
14 nucleobase-containing compound metabolic process 0.0257 4504
15 positive regulation of cellular process 0.0372 4453
query_size intersection_size
1 97 44
2 97 10
3 97 47
4 97 4
5 97 39
6 97 7
7 97 7
8 97 62
9 97 38
10 97 38
11 97 3
12 97 4
13 97 43
14 97 44
15 97 43
Session info
# gProfiler/GO version
get_version_info(organism = "hsapiens")$gprofiler_version[1] "e112_eg59_p19_25aa4782"
get_version_info(organism = "hsapiens")$sources$`GO:MF`$name
[1] "molecular function"
$version
[1] "annotations: BioMart\nclasses: releases/2024-10-27"
sessionInfo()R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Rocky Linux 8.10 (Green Obsidian)
Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Zurich
tzcode source: system (glibc)
attached base packages:
[1] grid stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] forcats_1.0.0 DT_0.33
[3] gprofiler2_0.2.3 ggtext_0.1.2
[5] circlize_0.4.16 ComplexHeatmap_2.22.0
[7] cowplot_1.1.3 tidyr_1.3.1
[9] dplyr_1.1.4 scater_1.34.1
[11] ggplot2_3.5.2 scuttle_1.16.0
[13] SingleCellExperiment_1.28.1 SummarizedExperiment_1.36.0
[15] Biobase_2.66.0 GenomicRanges_1.58.0
[17] GenomeInfoDb_1.42.3 IRanges_2.40.1
[19] S4Vectors_0.44.0 BiocGenerics_0.52.0
[21] MatrixGenerics_1.18.1 matrixStats_1.5.0
loaded via a namespace (and not attached):
[1] bitops_1.0-9 gridExtra_2.3 rlang_1.1.6
[4] magrittr_2.0.3 clue_0.3-66 GetoptLong_1.0.5
[7] compiler_4.4.2 png_0.1-8 vctrs_0.6.5
[10] stringr_1.5.1 pkgconfig_2.0.3 shape_1.4.6.1
[13] crayon_1.5.3 fastmap_1.2.0 XVector_0.46.0
[16] labeling_0.4.3 rmarkdown_2.29 markdown_2.0
[19] UCSC.utils_1.2.0 ggbeeswarm_0.7.2 purrr_1.0.4
[22] xfun_0.52 bluster_1.16.0 zlibbioc_1.52.0
[25] beachmat_2.22.0 litedown_0.7 jsonlite_2.0.0
[28] DelayedArray_0.32.0 BiocParallel_1.40.2 irlba_2.3.5.1
[31] parallel_4.4.2 cluster_2.1.6 R6_2.6.1
[34] stringi_1.8.7 RColorBrewer_1.1-3 limma_3.62.2
[37] Rcpp_1.0.14 iterators_1.0.14 knitr_1.50
[40] Matrix_1.7-1 igraph_2.1.4 tidyselect_1.2.1
[43] dichromat_2.0-0.1 abind_1.4-8 yaml_2.3.10
[46] viridis_0.6.5 doParallel_1.0.17 codetools_0.2-20
[49] lattice_0.22-6 tibble_3.2.1 withr_3.0.2
[52] evaluate_1.0.3 xml2_1.3.8 pillar_1.10.2
[55] foreach_1.5.2 plotly_4.10.4 generics_0.1.3
[58] RCurl_1.98-1.17 commonmark_1.9.5 scales_1.3.0.9000
[61] glue_1.8.0 metapod_1.14.0 lazyeval_0.2.2
[64] tools_4.4.2 BiocNeighbors_2.0.1 data.table_1.17.0
[67] ScaledMatrix_1.14.0 locfit_1.5-9.12 scran_1.34.0
[70] edgeR_4.4.2 colorspace_2.1-1 GenomeInfoDbData_1.2.13
[73] beeswarm_0.4.0 BiocSingular_1.22.0 vipor_0.4.7
[76] cli_3.6.4 rsvd_1.0.5 S4Arrays_1.6.0
[79] viridisLite_0.4.2 gtable_0.3.6 digest_0.6.37
[82] SparseArray_1.6.2 ggrepel_0.9.6 dqrng_0.4.1
[85] rjson_0.2.23 htmlwidgets_1.6.4 farver_2.1.2
[88] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7
[91] statmod_1.5.0 GlobalOptions_0.1.2 gridtext_0.1.5