Generate a GRangesList object with genomic ranges for (any combination of) spliced transcripts, unspliced transcripts and introns.
getFeatureRanges(
gtf,
featureType = c("spliced", "intron"),
intronType = "separate",
flankLength = 90L,
joinOverlappingIntrons = FALSE,
collapseIntronsByGene = FALSE,
keepIntronInFeature = FALSE,
verbose = TRUE
)
Path to gtf file.
Character vector indicating the type(s) of features to
extract, any subset of c("spliced", "intron", "unspliced")
.
Character vector indicating how to define the introns
(only used if "intron" is part of featureType
). Has to be either
"separate" (introns are defined for each transcript separately) or
"collapse" (transcripts of the same gene are first collapsed before
introns are defined as any non-exonic part of the gene locus).
Integer scalar indicating the length of the flanking
sequence added to each side of each extracted intron (only used if
"intron" is included in featureType
).
Logical scalar indicating whether two introns that overlap (after adding the flanking sequences) should be joined into one feature.
Logical scalar indicating whether to collapse overlapping intron ranges by gene after extraction.
Logical scalar indicating whether introns
(after adding the flank length) should be restricted to stay within the
boundaries of the feature they were generated from. For backwards
compatibility, the default is FALSE
.
Logical scalar, whether to print out progress messages.
Returns a GRangesList
object where each element represents
one extracted feature. The metadata of this object contains two
data.frame
s mapping corresponding identifiers between the
different feature types, as well as a list of all features for each type.
## Get feature ranges
grl <- getFeatureRanges(
gtf = system.file("extdata/small_example.gtf", package = "eisaR"),
featureType = c("spliced", "intron"),
intronType = "separate",
flankLength = 5L,
joinOverlappingIntrons = FALSE,
collapseIntronsByGene = FALSE,
verbose = TRUE
)
#> Import genomic features from the file as a GRanges object ...
#> OK
#> Prepare the 'metadata' data frame ...
#> OK
#> Make the TxDb object ...
#> OK
#> 'select()' returned 1:1 mapping between keys and columns
#> Extracting spliced transcript features
#> Extracting introns using the separate approach
## GRangesList
grl
#> GRangesList object of length 17:
#> $tx1.3
#> GRanges object with 1 range and 5 metadata columns:
#> seqnames ranges strand | exon_id exon_rank transcript_id
#> <Rle> <IRanges> <Rle> | <character> <integer> <character>
#> [1] chr1 61-90 + | E6 1 tx1.3
#> gene_id type
#> <character> <character>
#> [1] g1 exon
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $tx1.2
#> GRanges object with 3 ranges and 5 metadata columns:
#> seqnames ranges strand | exon_id exon_rank transcript_id
#> <Rle> <IRanges> <Rle> | <character> <integer> <character>
#> [1] chr1 61-80 + | E3 1 tx1.2
#> [2] chr1 101-120 + | E4 2 tx1.2
#> [3] chr1 141-160 + | E5 3 tx1.2
#> gene_id type
#> <character> <character>
#> [1] g1 exon
#> [2] g1 exon
#> [3] g1 exon
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $tx1.1
#> GRanges object with 2 ranges and 5 metadata columns:
#> seqnames ranges strand | exon_id exon_rank transcript_id
#> <Rle> <IRanges> <Rle> | <character> <integer> <character>
#> [1] chr1 116-120 + | E1 1 tx1.1
#> [2] chr1 141-160 + | E2 2 tx1.1
#> gene_id type
#> <character> <character>
#> [1] g1 exon
#> [2] g1 exon
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> ...
#> <14 more elements>
## Corresponding transcript/gene IDs
S4Vectors::metadata(grl)$corrtx
#> spliced
#> 1 tx1.3
#> 2 tx1.2
#> 3 tx1.1
#> 4 tx2.1-I
#> 5 tx2.2-U
#> 6 tx3.3
#> 7 tx3.1
#> 8 tx3.2
S4Vectors::metadata(grl)$corrgene
#> spliced intron
#> 1 g1 g1-I
#> 2 g2-I g2-I-I
#> 3 g3-U g3-U-I
## List of features of different types
S4Vectors::metadata(grl)$featurelist
#> $spliced
#> [1] "tx1.3" "tx1.2" "tx1.1" "tx2.1-I" "tx2.2-U" "tx3.3" "tx3.1"
#> [8] "tx3.2"
#>
#> $intron
#> [1] "tx1.2-I" "tx1.2-I1" "tx1.1-I" "tx2.1-I-I" "tx2.2-U-I"
#> [6] "tx2.2-U-I1" "tx3.1-I" "tx3.2-I" "tx3.2-I1"
#>
## Get feature sequences
if (requireNamespace("BSgenome", quietly = TRUE) &&
requireNamespace("GenomicFeatures", quietly = TRUE)) {
library(BSgenome)
genome <- Biostrings::readDNAStringSet(
system.file("extdata/small_example_genome.fa", package = "eisaR"))
seqs <- GenomicFeatures::extractTranscriptSeqs(x = genome,
transcripts = grl)
seqs
}
#> Loading required package: BiocGenerics
#>
#> Attaching package: ‘BiocGenerics’
#> The following objects are masked from ‘package:stats’:
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from ‘package:base’:
#>
#> Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
#> as.data.frame, basename, cbind, colnames, dirname, do.call,
#> duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#> lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#> pmin.int, rank, rbind, rownames, sapply, saveRDS, setdiff, table,
#> tapply, union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#>
#> Attaching package: ‘S4Vectors’
#> The following object is masked from ‘package:utils’:
#>
#> findMatches
#> The following objects are masked from ‘package:base’:
#>
#> I, expand.grid, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: GenomicRanges
#> Loading required package: Biostrings
#> Loading required package: XVector
#>
#> Attaching package: ‘Biostrings’
#> The following object is masked from ‘package:base’:
#>
#> strsplit
#> Loading required package: BiocIO
#> Loading required package: rtracklayer
#>
#> Attaching package: ‘rtracklayer’
#> The following object is masked from ‘package:BiocIO’:
#>
#> FileForFormat
#> DNAStringSet object of length 17:
#> width seq names
#> [1] 30 TGTTCAAGCGCCTACGTGGGTTGGGCAACC tx1.3
#> [2] 60 TGTTCAAGCGCCTACGTGGGGAG...GTGACCTAGAGGTAATAAGGGT tx1.2
#> [3] 25 TAGGTGACCTAGAGGTAATAAGGGT tx1.1
#> [4] 40 CGCTTGAACACGTATAATGCAGTCTCGTCATCTCCCCCAG tx2.1-I
#> [5] 40 CCCACGTAGGCGCTTGAACAGACCACGCGGTCTCCCCCAG tx2.2-U
#> ... ... ...
#> [13] 25 CGCGGCGGCTAGTCTCGTCATCTCC tx2.2-U-I
#> [14] 25 GAACACGTATAATGCGGCAGGACCA tx2.2-U-I1
#> [15] 60 ATCCACAGTGATAACTTACTTGG...GTGTTGCCATTTTATCGGCTTG tx3.1-I
#> [16] 30 CCCGTTGTGCATCCACAGTGATAACTTACT tx3.2-I
#> [17] 30 ATAGAGGAGTGTTGCCATTTTATCGGCTTG tx3.2-I1