Deconvolve bulk timecourse with epithelial atlas

Published

July 1, 2025

Load packages

suppressPackageStartupMessages({
    library(SingleCellExperiment)
    library(scater)
    library(BayesPrism)
    library(pheatmap)
})

source("../params.R")

Define parameters

bulk <- "data/IVMC.raw_counts.20250403.tab"
sc <- "../scRNAseq_invivo_endometrium_epithelial_cell_atlas/invivo_epithelial_cell_atlas.rds"

Session info

sessionInfo()

R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Rocky Linux 8.10 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Zurich
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] pheatmap_1.0.12             BayesPrism_2.2.2           
 [3] NMF_0.28                    bigmemory_4.6.4            
 [5] cluster_2.1.6               rngtools_1.5.2             
 [7] registry_0.5-1              snowfall_1.84-6.3          
 [9] snow_0.4-4                  scater_1.34.1              
[11] ggplot2_3.5.2               scuttle_1.16.0             
[13] SingleCellExperiment_1.28.1 SummarizedExperiment_1.36.0
[15] Biobase_2.66.0              GenomicRanges_1.58.0       
[17] GenomeInfoDb_1.42.3         IRanges_2.40.1             
[19] S4Vectors_0.44.0            BiocGenerics_0.52.0        
[21] MatrixGenerics_1.18.1       matrixStats_1.5.0          

loaded via a namespace (and not attached):
 [1] bitops_1.0-9            gridExtra_2.3           rlang_1.1.6            
 [4] magrittr_2.0.3          gridBase_0.4-7          compiler_4.4.2         
 [7] vctrs_0.6.5             reshape2_1.4.4          stringr_1.5.1          
[10] shape_1.4.6.1           pkgconfig_2.0.3         crayon_1.5.3           
[13] fastmap_1.2.0           XVector_0.46.0          caTools_1.18.3         
[16] rmarkdown_2.29          UCSC.utils_1.2.0        ggbeeswarm_0.7.2       
[19] xfun_0.52               bluster_1.16.0          zlibbioc_1.52.0        
[22] beachmat_2.22.0         jsonlite_2.0.0          DelayedArray_0.32.0    
[25] uuid_1.2-1              BiocParallel_1.40.2     irlba_2.3.5.1          
[28] parallel_4.4.2          R6_2.6.1                stringi_1.8.7          
[31] RColorBrewer_1.1-3      limma_3.62.2            Rcpp_1.0.14            
[34] iterators_1.0.14        knitr_1.50              Matrix_1.7-1           
[37] igraph_2.1.4            tidyselect_1.2.1        dichromat_2.0-0.1      
[40] abind_1.4-8             yaml_2.3.10             viridis_0.6.5          
[43] doParallel_1.0.17       gplots_3.2.0            codetools_0.2-20       
[46] lattice_0.22-6          tibble_3.2.1            plyr_1.8.9             
[49] withr_3.0.2             evaluate_1.0.3          circlize_0.4.16        
[52] pillar_1.10.2           BiocManager_1.30.25     KernSmooth_2.23-24     
[55] foreach_1.5.2           generics_0.1.3          scales_1.3.0.9000      
[58] gtools_3.9.5            glue_1.8.0              metapod_1.14.0         
[61] tools_4.4.2             BiocNeighbors_2.0.1     ScaledMatrix_1.14.0    
[64] locfit_1.5-9.12         scran_1.34.0            grid_4.4.2             
[67] edgeR_4.4.2             colorspace_2.1-1        GenomeInfoDbData_1.2.13
[70] beeswarm_0.4.0          BiocSingular_1.22.0     vipor_0.4.7            
[73] cli_3.6.4               rsvd_1.0.5              bigmemory.sri_0.1.8    
[76] S4Arrays_1.6.0          viridisLite_0.4.2       dplyr_1.1.4            
[79] gtable_0.3.6            digest_0.6.37           dqrng_0.4.1            
[82] SparseArray_1.6.2       ggrepel_0.9.6           htmlwidgets_1.6.4      
[85] farver_2.1.2            htmltools_0.5.8.1       lifecycle_1.0.4        
[88] httr_1.4.7              GlobalOptions_0.1.2     statmod_1.5.0

Read data

bulk <- read.delim(bulk, header = TRUE)
sc <- readRDS(sc)

## Add/modify annotations for single-cell data
table(sc$donorCol, useNA = "ifany")


       A13        A30         E1         E3 GSM4577306 GSM4577307 GSM4577308 
       808       4738         90        761        789       2124       4805 
GSM4577309 GSM4577310 GSM4577311 GSM4577312 GSM4577313 GSM4577314 GSM4577315 
     14065       2492       2171       2581        172       1218       1805 
GSM6605437 GSM6605438 GSM6605439 GSM6605440 GSM7277296 GSM7277297 GSM7277298 
      2730        710       1604       1434       1876        657       1127 
TB13020851 TB13021372 
       112         53

sc$annotated_celltype <- as.character(sc$leiden_res0.4_annot)
(tbl <- table(sc$annotated_celltype))


                ciliated                  cycling hormone responsive early 
                    3595                     6035                     4874 
 hormone responsive late                    KRT5+            luminal early 
                    3175                       81                     1274 
            luminal late                   MUC5B+             pre-ciliated 
                   10567                      152                      178 
              remodeling          secretory early           secretory late 
                     734                     4299                     1265 
           secretory mid transcriptionally active 
                   11291                     1402

(celltypes_to_keep <- names(tbl)[tbl > 10])

 [1] "ciliated"                 "cycling"                 
 [3] "hormone responsive early" "hormone responsive late" 
 [5] "KRT5+"                    "luminal early"           
 [7] "luminal late"             "MUC5B+"                  
 [9] "pre-ciliated"             "remodeling"              
[11] "secretory early"          "secretory late"          
[13] "secretory mid"            "transcriptionally active"

sc <- sc[, sc$annotated_celltype %in% celltypes_to_keep]

scater::plotReducedDim(
    sc, dimred = "UMAP_fastMNNcorr_sketch",
    colour_by = "annotated_celltype"
) + scale_color_manual(values = tissue_leiden_res0.4_annot_colors, name = "")

Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Look at distribution of cell types across single-cell samples

tbl <- unclass(table(sc$annotated_celltype, sc$donorCol))
pheatmap::pheatmap(tbl)

pheatmap::pheatmap(sweep(tbl, MARGIN = 2, STATS = colSums(tbl), FUN = "/"))

Predict cell type composition

bulkmat <- t(as.matrix(bulk))
scmat <- t(as.matrix(assay(sc, "counts")))

Warning in asMethod(object): sparse->dense coercion: allocating vector of size
7.4 GiB

## Make sure gene names are the same
ints <- intersect(colnames(bulkmat), colnames(scmat))
length(ints)

[1] 20149

bulkmat <- bulkmat[, ints]
scmat <- scmat[, ints]
stopifnot(all(colnames(bulkmat) == colnames(scmat)))
dim(bulkmat)

[1]   114 20149

dim(scmat)

[1] 48922 20149

sc.stat <- plot.scRNA.outlier(
    input = scmat,
    cell.type.labels = sc$annotated_celltype,
    species = "hs",
    return.raw = TRUE
)

Gene symbols detected. Recommend to use EMSEMBLE IDs for more unique mapping.

bulk.stat <- plot.bulk.outlier(
    bulk.input = bulkmat, 
    sc.input = scmat,
    cell.type.labels = sc$annotated_celltype,
    species = "hs",
    return.raw = TRUE
)

Gene symbols detected. Recommend to use EMSEMBLE IDs for more unique mapping.

scmat.filt <- cleanup.genes(
    input = scmat, 
    input.type = "count.matrix",
    species = "hs",
    gene.group = c("Rb", "Mrp", "other_Rb", "chrM",
                   "MALAT1"),
    exp.cells = 5
)

Gene symbols detected. Recommend to use EMSEMBLE IDs for more unique mapping.
number of genes filtered in each category: 
      Rb      Mrp other_Rb     chrM   MALAT1 
       0       12        6       37        0 
A total of  54  genes from Rb Mrp other_Rb chrM MALAT1  have been excluded 
A total of  2576  gene expressed in fewer than  5  cells have been excluded

plot.bulk.vs.sc(
    sc.input = scmat.filt, 
    bulk.input = bulkmat
)

Gene symbols detected. Recommend to use EMSEMBLE IDs for more unique mapping.

scmat.filt.pc <- select.gene.type(
    scmat.filt, 
    gene.type = "protein_coding"
)

Gene symbols detected. Recommend to use EMSEMBLE IDs for more unique mapping.
number of genes retained in each category: 

protein_coding 
         16278

plot.cor.phi(
    input = scmat.filt.pc,
    input.labels = sc$annotated_celltype,
    title = "cell type correlation",
    cexRow = 0.8, cexCol = 0.8,
    margins = c(5, 5)
)

diff.exp.stat <- get.exp.stat(
    sc.dat = scmat[, colSums(scmat > 0) > 3],
    cell.type.labels = sc$annotated_celltype,
    cell.state.labels = sc$annotated_celltype,
    pseudo.count = 0.1, 
    cell.count.cutoff = 50,
    n.cores = 8L
)
scmat.filt.pc.sig <- select.marker(
    sc.dat = scmat.filt.pc,
    stat = diff.exp.stat,
    pval.max = 0.01,
    lfc.min = 0.1
)

number of markers selected for each cell type: 
cycling :  436 
luminal late :  137 
ciliated :  213 
hormone responsive early :  308 
hormone responsive late :  763 
secretory mid :  250 
secretory early :  81 
KRT5+ :  182 
transcriptionally active :  33 
luminal early :  163 
secretory late :  158 
remodeling :  0 
pre-ciliated :  1857 
MUC5B+ :  311

sc$annotated_celltype_grouped <- as.character(sc$annotated_celltype)

## Marker genes
stopifnot(all(rownames(scmat.filt.pc.sig) == colnames(sc)))
myPrism2 <- new.prism(
    reference = scmat.filt.pc.sig, 
    mixture = bulkmat, 
    input.type = "count.matrix",
    cell.type.labels = sc$annotated_celltype_grouped,
    cell.state.labels = sc$annotated_celltype_grouped,
    key = NULL, 
    outlier.cut = 0.01, 
    outlier.fraction = 0.1
)

number of cells in each cell state 
cell.state.labels
                   KRT5+                   MUC5B+             pre-ciliated 
                      81                      152                      178 
              remodeling           secretory late            luminal early 
                     734                     1265                     1274 
transcriptionally active  hormone responsive late                 ciliated 
                    1402                     3175                     3595 
         secretory early hormone responsive early                  cycling 
                    4299                     4874                     6035 
            luminal late            secretory mid 
                   10567                    11291 
No tumor reference is speficied. Reference cell types are treated equally. 
Number of outlier genes filtered from mixture = 12 
Aligning reference and mixture... 
Normalizing reference...

bpres_markers <- run.prism(
    prism = myPrism2, 
    n.cores = 10
)

Run Gibbs sampling... 
Current time:  2025-07-01 21:05:44.34839 
Estimated time to complete:  42mins 
Estimated finishing time:  2025-07-01 21:47:13.62839 
Start run...

Warning in searchCommandline(parallel, cpus = cpus, type = type, socketHosts =
socketHosts, : Unknown option on commandline: --file

R Version:  R version 4.4.2 (2024-10-31)

snowfall 1.84-6.3 initialized (using snow 0.4-4): parallel execution on 10 CPUs.


Stopping cluster

Update the reference matrix ...

Warning in searchCommandline(parallel, cpus = cpus, type = type, socketHosts =
socketHosts, : Unknown option on commandline: --file

snowfall 1.84-6.3 initialized (using snow 0.4-4): parallel execution on 10 CPUs.


Stopping cluster

Run Gibbs sampling using updated reference ... 
Current time:  2025-07-01 21:13:40.75125 
Estimated time to complete:  16mins 
Estimated finishing time:  2025-07-01 21:29:04.27125 
Start run...

Warning in searchCommandline(parallel, cpus = cpus, type = type, socketHosts =
socketHosts, : Unknown option on commandline: --file

snowfall 1.84-6.3 initialized (using snow 0.4-4): parallel execution on 10 CPUs.


Stopping cluster

saveRDS(
    bpres_markers, 
    file = "bayesprism_predicted_celltypes_marker_genes.rds"
)