Regroup reads — regroupReads • SingleMoleculeGenomicsIO

Regroup reads into new "samples", defined by readGroups.

Usage

regroupReads(se, readGroups)

regroupReadsByColData(se, colNames, withinSample = FALSE)

Arguments

se: A SummarizedExperiment object, e.g. obtained by readModBam. Rows represent positions, and columns samples. The object should have at least one read-level assay.
readGroups: A named list, where each element corresponds to a new group of reads (a "sample").
colNames: A character vector corresponding to the names of annotation (colData) columns, the combination of which represent the desired grouping of the reads. The names can be either columns of colData(se) itself, or columns in a nested (read-level) annotation column of colData(se). In the latter case, the colNames should be of the form outerColName:innerColName, where outerColName is a column name in colData(se), and innerColName is a column name in each element of colData(se)[[outerColName]].
withinSample: A logical scalar, indicating whether the regrouping should be done within each current sample (column of se) or not. If FALSE (default), reads are pooled across samples before being regrouped.

Value

A SummarizedExperiment object with columns corresponding to the names of readGroups. Only the read-level assays will be returned (as summary-level assays are likely no longer consistent with the regrouped read-level assays).

Author

Charlotte Soneson

Examples

library(SummarizedExperiment)
library(GenomicRanges)
modbamfiles <- system.file("extdata", c("6mA_1_10reads.bam",
                                        "6mA_2_10reads.bam"),
                           package = "SingleMoleculeGenomicsIO")
se <- readModBam(bamfiles = modbamfiles, regions = "chr1:6940000-6955000",
                 modbase = "a", verbose = TRUE,
                 variantPositions = GPos(seqnames = "chr1",
                                         pos = c(6940000, 6940500)),
                 BPPARAM = BiocParallel::SerialParam())
#> ℹ extracting base modifications from modBAM files
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ opening input file /Users/runner/work/_temp/Library/SingleMoleculeGenomicsIO/extdata/6mA_1_10reads.bam using 1 thread
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ reading alignments
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ removed 1724 unaligned (e.g. soft-masked) of 25090 called bases
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ read 3 alignments
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ opening input file /Users/runner/work/_temp/Library/SingleMoleculeGenomicsIO/extdata/6mA_2_10reads.bam using 1 thread
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ reading alignments
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ⠙   1 done (564/s) | 2ms
#> ⠙   1 done (131/s) | 8ms
#> 
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ removed 320 unaligned (e.g. soft-masked) of 10174 called bases
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ read 2 alignments
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ finding unique genomic positions...
#> ✔ finding unique genomic positions... [24ms]
#> 
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

#> ℹ collapsed 17739 positions to 7967 unique ones
#> ✔ collapsed 17739 positions to 7967 unique ones [194ms]
#> 
#> ⠙ 0.000 Mio. genomic positions processed (0.001 Mio./s) [1ms]

# number of reads per sample
lapply(assay(se, "mod_prob"), ncol)
#> $s1
#> [1] 3
#> 
#> $s2
#> [1] 2
#> 
# define groups
groups <- list(g1 = c("s1-233e48a7-f379-4dcf-9270-958231125563",
                      "s2-d03efe3b-a45b-430b-9cb6-7e5882e4faf8"),
               g2 = "s1-92e906ae-cddb-4347-a114-bf9137761a8d",
               g3 = c("s2-034b625e-6230-4f8d-a713-3a32cd96c298",
                      "s1-d52a5f6a-a60a-4f85-913e-eada84bfbfb9"))
# regroup reads
sere <- regroupReads(se, readGroups = groups)
lapply(assay(sere, "mod_prob"), ncol)
#> $g1
#> [1] 2
#> 
#> $g2
#> [1] 1
#> 
#> $g3
#> [1] 2
#> 

# regroup by read annotation (variant label)
# ... across samples
sere <- regroupReadsByColData(se, colNames = "readInfo:variant_label",
                              withinSample = FALSE)
sere
#> class: RangedSummarizedExperiment 
#> dim: 7967 2 
#> metadata(4): readLevelData variantPositions bamHeader filteredOutReads
#> assays(1): mod_prob
#> rownames: NULL
#> rowData names(0):
#> colnames(2): G- GT
#> colData names(4): sample modbase n_reads readInfo
lapply(assay(sere, "mod_prob"), ncol)
#> $`G-`
#> [1] 3
#> 
#> $GT
#> [1] 2
#> 
# ... within sample
sere <- regroupReadsByColData(se, colNames = "readInfo:variant_label",
                              withinSample = TRUE)
sere
#> class: RangedSummarizedExperiment 
#> dim: 7967 3 
#> metadata(4): readLevelData variantPositions bamHeader filteredOutReads
#> assays(1): mod_prob
#> rownames: NULL
#> rowData names(0):
#> colnames(3): s1-G- s1-GT s2-G-
#> colData names(4): sample modbase n_reads readInfo
lapply(assay(sere, "mod_prob"), ncol)
#> $`s1-G-`
#> [1] 1
#> 
#> $`s1-GT`
#> [1] 2
#> 
#> $`s2-G-`
#> [1] 2
#>