Generate a GRangesList object with feature ranges

Generate a GRangesList object with genomic ranges for (any combination of) spliced transcripts, unspliced transcripts and introns.

getFeatureRanges(
  gtf,
  featureType = c("spliced", "intron"),
  intronType = "separate",
  flankLength = 90L,
  joinOverlappingIntrons = FALSE,
  collapseIntronsByGene = FALSE,
  keepIntronInFeature = FALSE,
  verbose = TRUE
)

Arguments

gtf: Path to gtf file.
featureType: Character vector indicating the type(s) of features to extract, any subset of c("spliced", "intron", "unspliced").
intronType: Character vector indicating how to define the introns (only used if "intron" is part of featureType). Has to be either "separate" (introns are defined for each transcript separately) or "collapse" (transcripts of the same gene are first collapsed before introns are defined as any non-exonic part of the gene locus).
flankLength: Integer scalar indicating the length of the flanking sequence added to each side of each extracted intron (only used if "intron" is included in featureType).
joinOverlappingIntrons: Logical scalar indicating whether two introns that overlap (after adding the flanking sequences) should be joined into one feature.
collapseIntronsByGene: Logical scalar indicating whether to collapse overlapping intron ranges by gene after extraction.
keepIntronInFeature: Logical scalar indicating whether introns (after adding the flank length) should be restricted to stay within the boundaries of the feature they were generated from. For backwards compatibility, the default is FALSE.
verbose: Logical scalar, whether to print out progress messages.

Value

Returns a GRangesList object where each element represents one extracted feature. The metadata of this object contains two data.frames mapping corresponding identifiers between the different feature types, as well as a list of all features for each type.

Author

Charlotte Soneson

Examples

  ## Get feature ranges
  grl <- getFeatureRanges(
    gtf = system.file("extdata/small_example.gtf", package = "eisaR"),
    featureType = c("spliced", "intron"),
    intronType = "separate",
    flankLength = 5L,
    joinOverlappingIntrons = FALSE,
    collapseIntronsByGene = FALSE,
    verbose = TRUE
  )
#> Import genomic features from the file as a GRanges object ... 
#> OK
#> Prepare the 'metadata' data frame ... 
#> OK
#> Make the TxDb object ... 
#> OK
#> 'select()' returned 1:1 mapping between keys and columns
#> Extracting spliced transcript features
#> Extracting introns using the separate approach
  
  ## GRangesList
  grl
#> GRangesList object of length 17:
#> $tx1.3
#> GRanges object with 1 range and 5 metadata columns:
#>       seqnames    ranges strand |     exon_id exon_rank transcript_id
#>          <Rle> <IRanges>  <Rle> | <character> <integer>   <character>
#>   [1]     chr1     61-90      + |          E6         1         tx1.3
#>           gene_id        type
#>       <character> <character>
#>   [1]          g1        exon
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $tx1.2
#> GRanges object with 3 ranges and 5 metadata columns:
#>       seqnames    ranges strand |     exon_id exon_rank transcript_id
#>          <Rle> <IRanges>  <Rle> | <character> <integer>   <character>
#>   [1]     chr1     61-80      + |          E3         1         tx1.2
#>   [2]     chr1   101-120      + |          E4         2         tx1.2
#>   [3]     chr1   141-160      + |          E5         3         tx1.2
#>           gene_id        type
#>       <character> <character>
#>   [1]          g1        exon
#>   [2]          g1        exon
#>   [3]          g1        exon
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $tx1.1
#> GRanges object with 2 ranges and 5 metadata columns:
#>       seqnames    ranges strand |     exon_id exon_rank transcript_id
#>          <Rle> <IRanges>  <Rle> | <character> <integer>   <character>
#>   [1]     chr1   116-120      + |          E1         1         tx1.1
#>   [2]     chr1   141-160      + |          E2         2         tx1.1
#>           gene_id        type
#>       <character> <character>
#>   [1]          g1        exon
#>   [2]          g1        exon
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> ...
#> <14 more elements>
  
  ## Corresponding transcript/gene IDs
  S4Vectors::metadata(grl)$corrtx
#>   spliced
#> 1   tx1.3
#> 2   tx1.2
#> 3   tx1.1
#> 4 tx2.1-I
#> 5 tx2.2-U
#> 6   tx3.3
#> 7   tx3.1
#> 8   tx3.2
  S4Vectors::metadata(grl)$corrgene
#>   spliced intron
#> 1      g1   g1-I
#> 2    g2-I g2-I-I
#> 3    g3-U g3-U-I
  
  ## List of features of different types
  S4Vectors::metadata(grl)$featurelist
#> $spliced
#> [1] "tx1.3"   "tx1.2"   "tx1.1"   "tx2.1-I" "tx2.2-U" "tx3.3"   "tx3.1"  
#> [8] "tx3.2"  
#> 
#> $intron
#> [1] "tx1.2-I"    "tx1.2-I1"   "tx1.1-I"    "tx2.1-I-I"  "tx2.2-U-I" 
#> [6] "tx2.2-U-I1" "tx3.1-I"    "tx3.2-I"    "tx3.2-I1"  
#> 
  
  ## Get feature sequences
  if (requireNamespace("BSgenome", quietly = TRUE) &&
      requireNamespace("GenomicFeatures", quietly = TRUE)) {
    library(BSgenome)
    genome <- Biostrings::readDNAStringSet(
      system.file("extdata/small_example_genome.fa", package = "eisaR"))
    seqs <- GenomicFeatures::extractTranscriptSeqs(x = genome, 
                                                   transcripts = grl)
    seqs
  }
#> Loading required package: BiocGenerics
#> 
#> Attaching package: ‘BiocGenerics’
#> The following objects are masked from ‘package:stats’:
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from ‘package:base’:
#> 
#>     Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
#>     as.data.frame, basename, cbind, colnames, dirname, do.call,
#>     duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#>     lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#>     pmin.int, rank, rbind, rownames, sapply, setdiff, table, tapply,
#>     union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#> 
#> Attaching package: ‘S4Vectors’
#> The following object is masked from ‘package:utils’:
#> 
#>     findMatches
#> The following objects are masked from ‘package:base’:
#> 
#>     I, expand.grid, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: GenomicRanges
#> Loading required package: Biostrings
#> Loading required package: XVector
#> 
#> Attaching package: ‘Biostrings’
#> The following object is masked from ‘package:base’:
#> 
#>     strsplit
#> Loading required package: BiocIO
#> Loading required package: rtracklayer
#> 
#> Attaching package: ‘rtracklayer’
#> The following object is masked from ‘package:BiocIO’:
#> 
#>     FileForFormat
#> DNAStringSet object of length 17:
#>      width seq                                              names               
#>  [1]    30 TGTTCAAGCGCCTACGTGGGTTGGGCAACC                   tx1.3
#>  [2]    60 TGTTCAAGCGCCTACGTGGGGAG...GTGACCTAGAGGTAATAAGGGT tx1.2
#>  [3]    25 TAGGTGACCTAGAGGTAATAAGGGT                        tx1.1
#>  [4]    40 CGCTTGAACACGTATAATGCAGTCTCGTCATCTCCCCCAG         tx2.1-I
#>  [5]    40 CCCACGTAGGCGCTTGAACAGACCACGCGGTCTCCCCCAG         tx2.2-U
#>  ...   ... ...
#> [13]    25 CGCGGCGGCTAGTCTCGTCATCTCC                        tx2.2-U-I
#> [14]    25 GAACACGTATAATGCGGCAGGACCA                        tx2.2-U-I1
#> [15]    60 ATCCACAGTGATAACTTACTTGG...GTGTTGCCATTTTATCGGCTTG tx3.1-I
#> [16]    30 CCCGTTGTGCATCCACAGTGATAACTTACT                   tx3.2-I
#> [17]    30 ATAGAGGAGTGTTGCCATTTTATCGGCTTG                   tx3.2-I1