Repository: dzhang32/ggtranscript
Branch: master
Commit: 682a0df688ad
Files: 62
Total size: 227.7 KB
Directory structure:
gitextract_nqf_jr3r/
├── .Rbuildignore
├── .github/
│ ├── .gitignore
│ └── workflows/
│ └── check-bioc.yml
├── .gitignore
├── .pre-commit-config.yaml
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│ ├── add_exon_number.R
│ ├── add_utr.R
│ ├── data.R
│ ├── geom_half_range.R
│ ├── geom_intron.R
│ ├── geom_junction.R
│ ├── geom_junction_label_repel.R
│ ├── geom_range.R
│ ├── ggtranscript-package.R
│ ├── globals.R
│ ├── shorten_gaps.R
│ ├── to_diff.R
│ ├── to_intron.R
│ └── utils.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── codecov.yml
├── data/
│ ├── pknox1_annotation.rda
│ ├── sod1_annotation.rda
│ └── sod1_junctions.rda
├── data-raw/
│ ├── ggplot2_exts_thumbnail.R
│ ├── ggtranscript_logo.R
│ ├── sod1_junctions.R
│ └── sod1_pknox1_annotation.R
├── inst/
│ └── CITATION
├── man/
│ ├── add_exon_number.Rd
│ ├── add_utr.Rd
│ ├── geom_intron.Rd
│ ├── geom_junction.Rd
│ ├── geom_junction_label_repel.Rd
│ ├── geom_range.Rd
│ ├── ggtranscript.Rd
│ ├── shorten_gaps.Rd
│ ├── sod1_annotation.Rd
│ ├── sod1_junctions.Rd
│ ├── to_diff.Rd
│ └── to_intron.Rd
├── tests/
│ ├── testthat/
│ │ ├── test-add_exon_number.R
│ │ ├── test-add_utr.R
│ │ ├── test-geom_half_range.R
│ │ ├── test-geom_intron.R
│ │ ├── test-geom_junction.R
│ │ ├── test-geom_junction_label_repel.R
│ │ ├── test-geom_range.R
│ │ ├── test-shorten_gaps.R
│ │ ├── test-to_diff.R
│ │ ├── test-to_intron.R
│ │ └── test-utils.R
│ └── testthat.R
└── vignettes/
├── .gitignore
└── ggtranscript.Rmd
================================================
FILE CONTENTS
================================================
================================================
FILE: .Rbuildignore
================================================
^ggtranscript\.Rproj$
^\.Rproj\.user$
^dev$
^README\.Rmd$
^\.github$
^codecov\.yml$
^.pre-commit-config.yaml$
^data-raw$
^_pkgdown.yml$
^tests/testthat/_snaps/*
^LICENSE\.md$
^cran-comments\.md$
================================================
FILE: .github/.gitignore
================================================
*.html
================================================
FILE: .github/workflows/check-bioc.yml
================================================
on:
push:
pull_request:
name: R-CMD-check-bioc
## These environment variables control whether to run GHA code later on that is
## specific to testthat, covr, and pkgdown.
env:
has_testthat: 'true'
run_covr: 'true'
run_pkgdown: 'true'
has_RUnit: 'false'
cache-version: 'cache-v1'
jobs:
build-check:
runs-on: ${{ matrix.config.os }}
name: ${{ matrix.config.os }} (${{ matrix.config.r }})
container: ${{ matrix.config.cont }}
strategy:
fail-fast: false
matrix:
config:
- { os: ubuntu-latest, r: '4.4', bioc: '3.19', cont: "bioconductor/bioconductor_docker:RELEASE_3_19", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
NOT_CRAN: true
TZ: UTC
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Set R Library home on Linux
if: runner.os == 'Linux'
run: |
mkdir /__w/_temp/Library
echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile
- name: Checkout Repository
uses: actions/checkout@v3
- name: Setup R from r-lib
if: runner.os != 'Linux'
uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
- name: Setup pandoc from r-lib
if: runner.os != 'Linux'
uses: r-lib/actions/setup-pandoc@v2
- name: Query dependencies
run: |
install.packages('remotes')
saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
shell: Rscript {0}
- name: Restore R package cache
if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'"
uses: actions/cache@v3
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-
- name: Cache R packages on Linux
if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
uses: actions/cache@v3
with:
path: /home/runner/work/_temp/Library
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-
- name: Install Linux system dependencies
if: runner.os == 'Linux'
run: |
sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))')
echo $sysreqs
sudo -s eval "$sysreqs"
- name: Install BiocManager
run: |
message(paste('****', Sys.time(), 'installing BiocManager ****'))
remotes::install_cran("BiocManager")
shell: Rscript {0}
- name: Set BiocVersion
run: |
BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE)
shell: Rscript {0}
- name: Install dependencies pass 1
run: |
## Try installing the package dependencies in steps. First the local
## dependencies, then any remaining dependencies to avoid the
## issues described at
## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html
## https://github.com/r-lib/remotes/issues/296
## Ideally, all dependencies should get installed in the first pass.
## Set the repos source depending on the OS
## Alternatively use https://storage.googleapis.com/bioconductor_docker/packages/
## though based on https://bit.ly/bioc2021-package-binaries
## the Azure link will be the main one going forward.
gha_repos <- if(
.Platform$OS.type == "unix" && Sys.info()["sysname"] != "Darwin"
) c(
"AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/3.14/bioc",
BiocManager::repositories()
) else BiocManager::repositories()
## For running the checks
message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****'))
install.packages(c("rcmdcheck", "BiocCheck"), repos = gha_repos)
## Pass #1 at installing dependencies
## This pass uses AnVIL-powered fast binaries
## details at https://github.com/nturaga/bioc2021-bioconductor-binaries
## The speed gains only apply to the docker builds.
message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
remotes::install_local(dependencies = TRUE, repos = gha_repos, build_vignettes = FALSE, upgrade = TRUE)
continue-on-error: true
shell: Rscript {0}
- name: Install dependencies pass 2
run: |
## Pass #2 at installing dependencies
## This pass does not use AnVIL and will thus update any packages
## that have seen been updated in Bioconductor
message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE)
shell: Rscript {0}
- name: Install BiocGenerics
if: env.has_RUnit == 'true'
run: |
## Install BiocGenerics
BiocManager::install("BiocGenerics")
shell: Rscript {0}
- name: Install covr
if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux'
run: |
remotes::install_cran("covr")
shell: Rscript {0}
- name: Install pkgdown
if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
run: |
remotes::install_cran("pkgdown")
shell: Rscript {0}
- name: Session info
run: |
options(width = 100)
pkgs <- installed.packages()[, "Package"]
sessioninfo::session_info(pkgs, include_base = TRUE)
shell: Rscript {0}
- name: Run CMD check
env:
_R_CHECK_CRAN_INCOMING_: false
DISPLAY: 99.0
run: |
options(crayon.enabled = TRUE)
rcmdcheck::rcmdcheck(
args = c("--no-manual", "--no-vignettes", "--timings"),
build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
error_on = "warning",
check_dir = "check"
)
shell: Rscript {0}
## Might need an to add this to the if: && runner.os == 'Linux'
- name: Reveal testthat details
if: env.has_testthat == 'true'
run: find . -name testthat.Rout -exec cat '{}' ';'
- name: Run RUnit tests
if: env.has_RUnit == 'true'
run: |
BiocGenerics:::testPackage()
shell: Rscript {0}
- name: Run BiocCheck
env:
DISPLAY: 99.0
run: |
BiocCheck::BiocCheck(
dir('check', 'tar.gz$', full.names = TRUE),
`quit-with-status` = FALSE,
`no-check-R-ver` = TRUE,
`no-check-bioc-help` = TRUE
)
shell: Rscript {0}
- name: Test coverage
if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux'
run: |
covr::codecov()
shell: Rscript {0}
- name: Install package
if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
run: R CMD INSTALL .
- name: Get R package info
if: runner.os == 'Linux'
run: |
#### DockerHub repos must be lowercase (,,) ####
name=$(grep '^Package:' DESCRIPTION | cut -d\ -f2)
echo "packageNameOrig=${name}" >> $GITHUB_ENV
echo $name
version=$(grep Version DESCRIPTION | grep -o "[0-9.]\+")
echo "packageVersion=${version}" >> $GITHUB_ENV
echo $version
shell: bash {0}
- name: Build and deploy pkgdown site
if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
run: |
git config --global --add safe.directory /__w/${{env.packageNameOrig}}/${{env.packageNameOrig}}
git config --local user.name "$GITHUB_ACTOR"
git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)"
shell: bash {0}
## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE)
## at least one locally before this will work. This creates the gh-pages
## branch (erasing anything you haven't version controlled!) and
## makes the git history recognizable by pkgdown.
- name: Upload check results
if: failure()
uses: actions/upload-artifact@master
with:
name: ${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-results
path: check
================================================
FILE: .gitignore
================================================
.Rproj.user
inst/doc
*.DS_Store
*.pdf
*.Rproj
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/lorenzwalthert/precommit
rev: v0.1.3.9133
hooks:
- id: readme-rmd-rendered # make sure README.Rmd is rendered to README.md
- id: parsable-R
exclude: >
(?x)^(
tests/testthat/in/style-files-fail-parse\.R|
tests/testthat/in/parsable-R-fail\.R|
)$
- id: style-files # style code in the tidyverse style
args: [--indent_by=4]
exclude: >
(?x)^(
tests/testthat/in/.*\.R|
renv/.*
)$
- id: deps-in-desc # all dependencies pkg::func are in listed in dec
args: [--allow_private_imports]
exclude: >
(?x)^(
tests/testthat/in/.*|
inst/renv-update\.R|
renv/activate.R|
vignettes/FAQ\.Rmd|
)$
- id: lintr
args: [--warn_only]
verbose: true
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
hooks:
- id: check-added-large-files # make sure no large files commited
- id: end-of-file-fixer
exclude: '\.Rd'
================================================
FILE: DESCRIPTION
================================================
Package: ggtranscript
Title: Visualizing Transcript Structure and Annotation using 'ggplot2'
Version: 1.0.0
Authors@R:
c(
person("David", "Zhang", , "dyzhang32@gmail.com",
role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-2382-8460")),
person("Emil", "Gustavsson", , "e.gustavsson@ucl.ac.uk", role = c("aut"),
comment = c(ORCID = "0000-0003-0541-7537")),
person("Regina", "Reynolds", , "regina.reynolds.16@ucl.ac.uk",
role = c("ctb"), comment = c(ORCID = "0000-0001-6470-7919")),
person("Sonia", "Ruiz", , "s.ruiz@ucl.ac.uk",
role = c("ctb"))
)
Description: The goal of ggtranscript is the simplify the process of visualizing
transcript structure and annotation. To achieve this, ggtranscript
introduces 5 new geoms (geom_range(), geom_half_range(), geom_intron(),
geom_junction() and geom_junction_label_repel()) as well as several helper
functions. As a 'ggplot2' extension, ggtranscript inherits 'ggplot2's
familiarity and flexibility, enabling users to intuitively adjust
aesthetics, parameters, scales etc as well as complement ggtranscript geoms
with existing 'ggplot2' geoms to create informative, publication-ready
plots.
License: MIT + file LICENSE
URL: https://github.com/dzhang32/ggtranscript
BugReports: https://github.com/dzhang32/ggtranscript/issues
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Suggests:
BiocStyle,
covr,
ggpubr,
knitr,
rmarkdown,
rtracklayer,
sessioninfo,
testthat (>= 3.0.0),
vdiffr
Config/testthat/edition: 3
VignetteBuilder: knitr
Depends:
R (>= 2.10)
LazyData: true
Imports:
dplyr,
GenomicRanges,
ggplot2,
magrittr,
rlang,
S4Vectors,
GenomeInfoDb,
ggrepel
Collate:
'add_exon_number.R'
'add_utr.R'
'data.R'
'geom_range.R'
'geom_half_range.R'
'geom_intron.R'
'geom_junction.R'
'geom_junction_label_repel.R'
'ggtranscript-package.R'
'globals.R'
'shorten_gaps.R'
'to_diff.R'
'to_intron.R'
'utils.R'
================================================
FILE: LICENSE
================================================
YEAR: 2022
COPYRIGHT HOLDER: ggtranscript authors
================================================
FILE: LICENSE.md
================================================
# MIT License
Copyright (c) 2022 ggtranscript authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: NAMESPACE
================================================
# Generated by roxygen2: do not edit by hand
export(add_exon_number)
export(add_utr)
export(geom_half_range)
export(geom_intron)
export(geom_junction)
export(geom_junction_label_repel)
export(geom_range)
export(shorten_gaps)
export(to_diff)
export(to_intron)
import(ggrepel)
importFrom(ggplot2,aes)
importFrom(magrittr,"%>%")
importFrom(rlang,"%||%")
================================================
FILE: NEWS.md
================================================
# ggtranscript 1.0.0
## Fixes
* Updates `geom`s to work with the latest version of `ggplot2` and `ggrepel` (R version `4.4`).
* Fixes all unit tests.
* Fixes CI workflow issues and simplifies testing to linux-only.
# ggtranscript 0.99.9
## NEW FEATURES
* Address ggtranscript reviews; update docs with examples of using gtf/bed files, integration with `ggplot2` extensions and add usage of `shorten_gaps()` to README.
# ggtranscript 0.99.8
## NEW FEATURES
* Address CRAN feedback; changing ggplot2 -> 'ggplot2', remove biocViews and contributing, removing Date field in DESCRIPTION.
# ggtranscript 0.99.7
## NEW FEATURES
* Add `cran-comments.md` in preparation for first CRAN submission.
* Update CI to run `R CMD Check` on latest R version (4.2).
# ggtranscript 0.99.6
## NEW FEATURES
* Add `@return` documentation for `geom_*` functions for `BiocCheck`.
# ggtranscript 0.99.5
## NEW FEATURES
* Change branch to naming from main to master to match BBS.
# ggtranscript 0.99.4
## NEW FEATURES
* Change email to UCL email for Bioconductor submission.
## NEW FEATURES
* Add `add_utr()` for adding UTRs as ranges. This helper function is designed to
work with `shorten_gaps()`, enabling shortening of gaps whilst visually
differentiating UTRs from the CDS.
* Allow `to_intron()` to take CDS and UTRs ranges as input.
* Submit to Bioconductor.
# ggtranscript 0.99.2
## NEW FEATURES
* Add `geom_junction_label_repel()` for labeling junctions (e.g. with counts).
* Add `add_exon_number()` for visualizing the exon number/order.
# ggtranscript 0.99.1
## NEW FEATURES
* Implement base geoms: `geom_range()`, `geom_half_range()`, `geom_intron()`,
`geom_junction()` and helper functions: `to_intron()`, `to_diff()` and
`shorten_gaps()`.
================================================
FILE: R/add_exon_number.R
================================================
#' Add exon number
#'
#' `add_exon_number()` adds the exon number (the order the exons are transcribed
#' within each transcript) as a column in `exons`. This can be useful when
#' visualizing long, complex transcript structures, in order to keep track of
#' specific exons of interest.
#'
#' To note, a "strand" column must be present within `exons`. The strand is used
#' to differentiate whether exon numbers should be calculated according to
#' ascending ("+") or descending ("-") genomic co-ordinates. For ambiguous
#' strands ("*"), `add_exon_number()` will be assume the strand be "+".
#'
#' @inheritParams to_diff
#'
#' @return `data.frame()` equivalent to input `exons`, with the additional
#' column "exon_number".
#'
#' @export
#' @examples
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' # add the exon number for each transcript
#' sod1_exons <- sod1_exons %>% add_exon_number(group_var = "transcript_name")
#'
#' base <- sod1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = to_intron(sod1_exons, "transcript_name"),
#' strand = "+"
#' )
#'
#' # it can be useful to annotate exons with their exon number
#' # using ggplot2::geom_text()
#' base +
#' geom_text(aes(
#' x = (start + end) / 2, # plot label at midpoint of exon
#' label = exon_number
#' ),
#' size = 3.5,
#' nudge_y = 0.4
#' )
#'
#' # Or alternatively, using ggrepel::geom_label_repel()
#' # to separate labels from exons
#' base +
#' ggrepel::geom_label_repel(ggplot2::aes(
#' x = (start + end) / 2,
#' label = exon_number
#' ),
#' size = 3.5,
#' min.segment.length = 0
#' )
add_exon_number <- function(exons, group_var = NULL) {
.check_coord_object(exons, check_strand = TRUE)
.check_group_var(exons, group_var)
if (!is.null(group_var)) {
exons <- exons %>% dplyr::group_by_at(.vars = group_var)
}
# arrange to make sure order reflects genomic position
exons <- exons %>%
dplyr::arrange_at(c(.vars = c(group_var, "start", "end")))
# add exon number, assuming all plus strand at start
exons <- exons %>%
dplyr::mutate(
exon_number = dplyr::row_number(),
n_exons = dplyr::n()
) %>%
dplyr::ungroup()
# convert exon number for minus strand
exons <- exons %>%
dplyr::mutate(
exon_number = ifelse(
strand == "-",
n_exons - exon_number + 1,
exon_number
)
) %>%
dplyr::select(-n_exons)
return(exons)
}
================================================
FILE: R/add_utr.R
================================================
#' Add untranslated regions (UTRs)
#'
#' Given a set of `exons` (encompassing the CDS and UTRs) and `cds` regions,
#' `add_utr()` will calculate and add the corresponding UTR regions as ranges.
#' This can be useful when combined with `shorten_gaps()` to visualize
#' transcripts with long introns, whilst differentiating UTRs from CDS regions.
#'
#' The definition of the inputted `cds` regions are expected to range from the
#' beginning of the start codon to the end of the stop codon. Sometimes, for
#' example in the case of Ensembl, reference annotation will omit the stop
#' codons from the CDS definition. In such cases, users should manually ensure
#' that the `cds` includes both the start and stop codons.
#'
#' @inheritParams to_diff
#' @param cds `data.frame()` contains coding sequence ranges for the transcripts
#' in `exons`.
#'
#' @return `data.frame()` contains differentiated CDS and UTR ranges.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' pknox1_annotation %>% head()
#'
#' # extract exons
#' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
#' pknox1_exons %>% head()
#'
#' # extract cds
#' pknox1_cds <- pknox1_annotation %>% dplyr::filter(type == "CDS")
#' pknox1_cds %>% head()
#'
#' # the CDS definition originating from the Ensembl reference annotation
#' # does not include the stop codon
#' # we must incorporate the stop codons into the CDS manually
#' # by adding 3 base pairs to the end of the CDS of each transcript
#' pknox1_cds_w_stop <- pknox1_cds %>%
#' dplyr::group_by(transcript_name) %>%
#' dplyr::mutate(
#' end = ifelse(end == max(end), end + 3, end)
#' ) %>%
#' dplyr::ungroup()
#'
#' # add_utr() adds ranges that represent the UTRs
#' pknox1_cds_utr <- add_utr(
#' pknox1_exons,
#' pknox1_cds_w_stop,
#' group_var = "transcript_name"
#' )
#'
#' pknox1_cds_utr %>% head()
#'
#' # this can be useful when combined with shorten_gaps()
#' # to visualize transcripts with long introns whilst differentiating UTRs
#' pknox1_cds_utr_rescaled <-
#' shorten_gaps(
#' exons = pknox1_cds_utr,
#' introns = to_intron(pknox1_cds_utr, "transcript_name"),
#' group_var = "transcript_name"
#' )
#'
#' pknox1_cds_utr_rescaled %>%
#' dplyr::filter(type == "CDS") %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_range(
#' data = pknox1_cds_utr_rescaled %>% dplyr::filter(type == "UTR"),
#' height = 0.25,
#' fill = "white"
#' ) +
#' geom_intron(
#' data = to_intron(
#' pknox1_cds_utr_rescaled %>% dplyr::filter(type != "intron"),
#' "transcript_name"
#' ),
#' arrow.min.intron.length = 110
#' )
add_utr <- function(exons,
cds,
group_var = NULL) {
# input checks
.check_coord_object(exons, check_seqnames = TRUE)
.check_group_var(exons, group_var)
.check_coord_object(cds, check_seqnames = TRUE)
.check_group_var(cds, group_var)
# we have to create dummy group for downstream for loop if there is no group
null_group <- is.null(group_var)
if (null_group) {
exons <- exons %>% dplyr::mutate(dummy_group = "A")
cds <- cds %>% dplyr::mutate(dummy_group = "A")
group_var <- "dummy_group"
}
groups <- cds[[group_var]] %>% unique()
# convert to GenomicRanges for downstream processing
exons_gr <- exons %>% GenomicRanges::GRanges()
cds_gr <- cds %>% GenomicRanges::GRanges()
exons_w_utr <- vector("list", length = length(groups))
for (i in seq_along(groups)) {
exons_gr_curr <- exons_gr %>%
.[GenomicRanges::mcols(exons_gr)[[group_var]] == groups[i]]
cds_gr_curr <- cds_gr %>%
.[GenomicRanges::mcols(cds_gr)[[group_var]] == groups[i]]
# use setdiff to get regions in exon but not in cds (i.e. the utrs)
utrs_curr <- GenomicRanges::setdiff(exons_gr_curr, cds_gr_curr)
GenomicRanges::mcols(utrs_curr)[[group_var]] <- groups[i]
utrs_curr$type <- "UTR"
cds_gr_curr$type <- "CDS"
exons_w_utr[[i]] <- c(utrs_curr, cds_gr_curr) %>% sort()
}
exons_w_utr <- exons_w_utr %>%
do.call(c, .) %>%
as.data.frame() %>%
dplyr::as_tibble()
# remove dummp_group if created
if (null_group) {
exons_w_utr <- exons_w_utr %>% dplyr::select(-dummy_group)
}
return(exons_w_utr)
}
================================================
FILE: R/data.R
================================================
#' Example transcript annotation
#'
#' Transcript annotation including the co-ordinates (hg38) of the genes,
#' transcripts, exons and CDS regions for \emph{SOD1} and \emph{PKNOX1}, which
#' originate from version 105 of the Ensembl reference annotation.
#'
#' @format A `tibble::tibble()`:
#' \describe{
#' \item{seqnames}{`factor()` chromosome.}
#' \item{start}{`integer()` start position.}
#' \item{end}{`integer()` end position.}
#' \item{strand}{`factor()` strand.}
#' \item{type}{`factor()` E.g.gene, transcript, exon or CDS.}
#' \item{gene_name}{`character()` name of gene (GBA).}
#' \item{transcript_name}{`character()` name of transcript.}
#' \item{transcript_biotype}{`character()` biotype of transcript.}
#' }
#'
#' @source generated using `ggtranscript/data-raw/sod1_pknox1_annotation.R`
"sod1_annotation"
#' @rdname sod1_annotation
"pknox1_annotation"
#' Example junctions
#'
#' Junction co-ordinates and counts associated with the \emph{SOD1} gene.
#' Junctions counts originate from GTEx liver samples and are downloaded via the
#' Bioconductor package `snapcount`. Only unannotated junctions with a mean
#' count above 0.3 have been retained for this example.
#'
#' @format A `tibble::tibble()`:
#' \describe{
#' \item{seqnames}{`factor()` chromosome.}
#' \item{start}{`integer()` start position.}
#' \item{end}{`integer()` end position.}
#' \item{strand}{`factor()` strand.}
#' \item{mean_count}{`factor()` Average count across all GTEx liver samples.}
#' }
#'
#' @source generated using `ggtranscript/data-raw/sod1_junctions.R`
"sod1_junctions"
================================================
FILE: R/geom_half_range.R
================================================
#' @param range.orientation `character()` one of "top" or "bottom", specifying
#' where the half ranges will be plotted with respect to each transcript
#' (`y`).
#'
#' @export
#' @rdname geom_range
geom_half_range <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
...,
range.orientation = "bottom",
linejoin = "mitre",
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
ggplot2::layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomHalfRange,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
range.orientation = range.orientation,
linejoin = linejoin,
na.rm = na.rm,
...
)
)
}
#' `GeomHalfRange` is `GeomRange` with default parameters for `vjust` and
#' `height` as well as the added parameter `range.orientation`
#'
#' @include geom_range.R
#' @keywords internal
#' @noRd
GeomHalfRange <- ggplot2::ggproto("GeomHalfRange", GeomRange,
setup_data = function(data, params) {
# check that range.orientation is one of possible options
.check_range.orientation(params)
# modified from ggplot2::GeomTile
data$height <- data$height %||% params$height %||% 0.25
transform(
data,
xmin = xstart,
xmax = xend,
ymin = y - height / 2,
ymax = y + height / 2,
height = NULL
)
},
draw_panel = function(data,
panel_params,
coord,
range.orientation = "bottom",
lineend = "butt",
linejoin = "mitre") {
vjust <- ifelse(
range.orientation == "bottom",
1.5,
0.5
)
GeomRange$draw_panel(
data = data,
panel_params = panel_params,
coord = coord,
vjust = vjust,
lineend = lineend,
linejoin = linejoin
)
}
)
#' @keywords internal
#' @noRd
.check_range.orientation <- function(params) {
not_orient_option <-
!(params$range.orientation %in% c("top", "bottom"))
if (not_orient_option) {
stop(
"range.orientation must be one of ",
"'alternating', 'top' or 'bottom'"
)
}
}
================================================
FILE: R/geom_intron.R
================================================
#' Plot intron lines with strand arrows
#'
#' `geom_intron()` draws horizontal lines with central arrows that are designed
#' to represent introns. In combination with `geom_range()`/`geom_half_range()`,
#' these geoms form the core components for visualizing transcript structures.
#'
#' `geom_intron()` requires the following `aes()`; `xstart`, `xend` and `y`
#' (e.g. transcript name). If users do not have intron co-ordinates, these can
#' be generated from the corresponding exons using `to_intron()`. The `strand`
#' option (one of "+" or "-") adjusts the arrow direction to match the direction
#' of transcription. The `arrow.min.intron.length` parameter can be useful to
#' remove strand arrows that overlap exons, which can be a problem if plotted
#' introns include those that are relatively short.
#'
#' @inheritParams ggplot2::layer
#' @inheritParams ggplot2::geom_point
#' @inheritParams ggplot2::geom_segment
#' @param arrow.min.intron.length `integer()` the minimum required width of an
#' intron for a strand arrow to be drawn. This can be useful to remove strand
#' arrows on short introns that overlap adjacent exons.
#'
#' @return the return value of a `geom_*` function is not intended to be
#' directly handled by users. Therefore, `geom_*` functions should never be
#' executed in isolation, rather used in combination with a
#' `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' pknox1_annotation %>% head()
#'
#' # extract exons
#' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
#' pknox1_exons %>% head()
#'
#' # to_intron() is a helper function included in ggtranscript
#' # which is useful for converting exon co-ordinates to introns
#' pknox1_introns <- pknox1_exons %>% to_intron(group_var = "transcript_name")
#' pknox1_introns %>% head()
#'
#' base <- pknox1_introns %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' ))
#'
#' # by default, geom_intron() assumes introns originate from the "+" strand
#' base + geom_intron()
#'
#' # however this can be modified using the strand option
#' base + geom_intron(strand = "-")
#'
#' # strand can also be set as an aes()
#' base + geom_intron(aes(strand = strand))
#'
#' # as a ggplot2 extension, ggtranscript geoms inherit the
#' # the functionality from the parameters and aesthetics in ggplot2
#' base + geom_intron(
#' aes(colour = transcript_name),
#' linewidth = 1
#' )
#'
#' # together, geom_range() and geom_intron() are designed to visualize
#' # the core components of transcript annotation
#' pknox1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = pknox1_introns
#' )
#'
#' # for short introns, sometimes strand arrows will overlap exons
#' # to avoid this, users can set the arrow.min.intron.length parameter
#' pknox1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = pknox1_introns,
#' arrow.min.intron.length = 3500
#' )
geom_intron <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
...,
arrow = grid::arrow(ends = "last", length = grid::unit(0.1, "inches")),
arrow.fill = NULL,
lineend = "butt",
linejoin = "round",
na.rm = FALSE,
arrow.min.intron.length = 0,
show.legend = NA,
inherit.aes = TRUE) {
ggplot2::layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomIntron,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
arrow = arrow,
arrow.fill = arrow.fill,
lineend = lineend,
linejoin = linejoin,
na.rm = na.rm,
arrow.min.intron.length = arrow.min.intron.length,
...
)
)
}
#' `GeomIntron` is pretty much `ggplot2::GeomSegment` with the `required_aes`
#' changed to `xstart`/`xend` to match genetic nomenclature and the added arrows
#' to indicate direction of transcription (configured with `strand` and
#' `arrow.min.intron.length`)
#' @noRd
GeomIntron <- ggplot2::ggproto("GeomIntron", ggplot2::GeomSegment,
required_aes = c("xstart", "xend", "y"),
default_aes = aes(
colour = "black",
linewidth = 0.5,
linetype = 1,
alpha = NA,
strand = "+"
),
setup_params = function(data, params) {
# check that arrow.min.intron.length numeric is >= 0
arrow.min_numeric <- is.numeric(params$arrow.min.intron.length)
arrow.min_neg <- params$arrow.min.intron.length < 0
if (!arrow.min_numeric | arrow.min_neg) {
stop("arrow.min.intron.length must be a numeric > 0")
}
params
},
setup_data = function(data, params) {
# needed to permit usage of xstart/xend
transform(
data,
x = xstart,
yend = y,
xstart = NULL
)
},
draw_panel = function(data,
panel_params,
coord,
arrow = NULL,
arrow.fill = NULL,
lineend = "butt",
linejoin = "round",
na.rm = FALSE,
arrow.min.intron.length = 0) {
# check that strand is scalar and one of "+" or "-"
.check_strand(data$strand)
# first, create the intron grob, which is just a pure line (no arrow)
intron_grob <- ggplot2::GeomSegment$draw_panel(
data = data,
panel_params = panel_params,
coord = coord,
arrow = NULL,
arrow.fill = NULL,
lineend = lineend,
linejoin = linejoin,
na.rm = na.rm
)
# then, create the arrow grobs, one per strand
# need both as the direction of arrow (as far I can tell) is
# is dependent on the orientation of the x/xend
strand_arrow_plus_grob <- .create_strand_arrow_grob(
target_strand = "+",
arrow.min.intron.length = arrow.min.intron.length,
data = data,
panel_params = panel_params,
coord = coord,
arrow = arrow,
arrow.fill = arrow.fill,
lineend = lineend,
linejoin = linejoin,
na.rm = na.rm
)
strand_arrow_minus_grob <- .create_strand_arrow_grob(
target_strand = "-",
arrow.min.intron.length = arrow.min.intron.length,
data = data,
panel_params = panel_params,
coord = coord,
arrow = arrow,
arrow.fill = arrow.fill,
lineend = lineend,
linejoin = linejoin,
na.rm = na.rm
)
# draw_panel expects return of a grob
# here, as we build multiple grobs (i.e. intron lines + arrows)
# we use a grobTree to combine the two
grid::grobTree(
intron_grob,
strand_arrow_plus_grob,
strand_arrow_minus_grob
)
}
)
#' @keywords internal
#' @noRd
.check_strand <- function(strand) {
# TODO - add option for "*" arrow?
any_na <- any(is.na(strand))
plus_minus <- !(all(strand %in% c("+", "-")))
if (any_na | plus_minus) {
stop("strand values must be one of '+' and '-'")
}
return(invisible())
}
#' @keywords internal
#' @noRd
.create_strand_arrow_grob <- function(target_strand,
arrow.min.intron.length,
data,
panel_params,
coord,
arrow,
arrow.fill,
lineend,
linejoin,
na.rm) {
# filter for introns that match target strand
# and have a length above arrow.min.intron.length
match_strand <- data$strand == target_strand
ab_min <- abs(data$x - data$xend) > arrow.min.intron.length
arrow_data <- data[match_strand & ab_min, ]
# if there are no arrows to plot, use a nullGrob() to add nothing
if (nrow(arrow_data) == 0) {
arrow_grob <- grid::nullGrob()
} else {
# obtain the the correct orientation of arrow (dependent on strand)
# as the arrow can only be placed at either end of a geom_segment/path
# the strand changes the x/xends around, shifting the around direction
if (target_strand == "+") {
arrow_data <- transform(
arrow_data,
xend = (x + xend) / 2
)
} else {
arrow_data <- transform(
arrow_data,
mid = (x + xend) / 2,
x = xend
)
arrow_data <- transform(
arrow_data,
xend = mid
)
}
arrow_grob <- ggplot2::GeomSegment$draw_panel(
data = arrow_data,
panel_params = panel_params,
coord = coord,
arrow = arrow,
arrow.fill = arrow.fill,
lineend = lineend,
linejoin = linejoin,
na.rm = na.rm
)
}
return(arrow_grob)
}
================================================
FILE: R/geom_junction.R
================================================
#' Plot junction curves
#'
#' `geom_junction()` draws curves that are designed to represent junction reads
#' from RNA-sequencing data. It can be useful to overlay junction data on
#' transcript annotation (plotted using `geom_range()`/`geom_half_range()` and
#' `geom_intron()`) to understand which splicing events or transcripts have
#' support from RNA-sequencing data.
#'
#' `geom_junction()` requires the following `aes()`; `xstart`, `xend` and `y`
#' (e.g. transcript name). `geom_junction()` curves can be modified using
#' `junction.y.max`, which can be useful when junctions overlap one
#' another/other transcripts or extend beyond the plot margins. By default,
#' junction curves will alternate between being plotted on the top and bottom of
#' each transcript (`y`), however this can be modified via
#' `junction.orientation`.
#'
#' @inheritParams ggplot2::layer
#' @inheritParams ggplot2::geom_bar
#' @inheritParams grid::curveGrob
#' @param junction.orientation `character()` one of "alternating", "top" or
#' "bottom", specifying where the junctions will be plotted with respect to
#' each transcript (`y`).
#' @param junction.y.max `double()` the max y-value of each junction curve. It
#' can be useful to adjust this parameter when junction curves overlap with
#' one another/other transcripts or extend beyond the plot margins.
#'
#' @return the return value of a `geom_*` function is not intended to be
#' directly handled by users. Therefore, `geom_*` functions should never be
#' executed in isolation, rather used in combination with a
#' `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # as well as a set of example (unannotated) junctions
#' # originating from GTEx and downloaded via the Bioconductor package snapcount
#' sod1_junctions
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(
#' type == "exon",
#' transcript_name == "SOD1-201"
#' )
#' sod1_exons %>% head()
#'
#' # add transcript_name to junctions for plotting
#' sod1_junctions <- sod1_junctions %>%
#' dplyr::mutate(transcript_name = "SOD1-201")
#'
#' # junctions can be plotted as curves using geom_junction()
#' base <- sod1_junctions %>%
#' ggplot2::ggplot(ggplot2::aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' ))
#'
#' # sometimes, depending on the number and widths of transcripts and junctions
#' # junctions will go overlap one another or extend beyond the plot margin
#' base + geom_junction()
#'
#' # in such cases, junction.y.max can be adjusted to modify the max y of curves
#' base + geom_junction(junction.y.max = 0.5)
#'
#' # ncp can be used improve the smoothness of curves
#' base + geom_junction(junction.y.max = 0.5, ncp = 30)
#'
#' # junction.orientation controls where the junction are plotted
#' # with respect to each transcript
#' # either alternating (default), or on the top or bottom
#' base + geom_junction(junction.orientation = "top", junction.y.max = 0.5)
#' base + geom_junction(junction.orientation = "bottom", junction.y.max = 0.5)
#'
#' # it can be useful useful to overlay junction curves onto existing annotation
#' # plotted using geom_range() and geom_intron()
#' base <- sod1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = to_intron(sod1_exons, "transcript_name")
#' )
#'
#' base + geom_junction(
#' data = sod1_junctions,
#' junction.y.max = 0.5
#' )
#'
#' # as a ggplot2 extension, ggtranscript geoms inherit the
#' # the functionality from the parameters and aesthetics in ggplot2
#' # this can be useful when mapping junction thickness to their counts
#' base + geom_junction(
#' data = sod1_junctions,
#' aes(linewidth = mean_count),
#' junction.y.max = 0.5,
#' colour = "purple"
#' ) +
#' scale_linewidth(range = c(0.1, 1))
#'
#' # it can be useful to combine geom_junction() with geom_half_range()
#' sod1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_half_range() +
#' geom_intron(
#' data = to_intron(sod1_exons, "transcript_name")
#' ) +
#' geom_junction(
#' data = sod1_junctions,
#' aes(linewidth = mean_count),
#' junction.y.max = 0.5,
#' junction.orientation = "top",
#' colour = "purple"
#' ) +
#' scale_linewidth(range = c(0.1, 1))
geom_junction <- function(mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
junction.orientation = "alternating",
junction.y.max = 1,
angle = 90,
ncp = 15,
na.rm = FALSE,
orientation = NA,
show.legend = NA,
inherit.aes = TRUE,
...) {
ggplot2::layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomJunction,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
junction.orientation = junction.orientation,
junction.y.max = junction.y.max,
angle = angle,
ncp = ncp,
na.rm = na.rm,
orientation = orientation,
...
)
)
}
#' @keywords internal
#' @noRd
GeomJunction <- ggplot2::ggproto("GeomJunction", ggplot2::GeomLine,
required_aes = c("xstart", "xend", "y"),
setup_data = function(data, params) {
# check that junction.orientation is length 1 + one of possible options
.check_junction.orientation(params)
# check that junction.y.max is length 1 + one of possible options
.check_junction.y.max(params)
# we need a unique group id per junction, rather than per transcript
# similar to spring example from ggplot2 book
# https://ggplot2-book.org/spring1.html#spring3
if (is.null(data$group)) {
data$group <- seq_len(nrow(data))
}
if (anyDuplicated(data$group)) {
data$group <- paste(data$group, seq_len(nrow(data)), sep = "-")
}
# needed to permit usage of xstart/xend
transform(
data,
x = xstart,
xstart = NULL
)
},
draw_panel = function(data,
panel_params,
coord,
junction.orientation = "alternating",
junction.y.max = 1,
angle = 90,
ncp = 15) {
# junction_index represents the order of each junction within tx
# needed for junction.orientation = "alternating"
data <- data %>%
dplyr::group_by(y) %>%
dplyr::mutate(junction_index = dplyr::row_number()) %>%
dplyr::ungroup()
# obtain the actual curves using grid:::calcControlPoints
junctions <- .get_junction_curves(data, angle, ncp)
# normalise curve points to lie between 0-1
# scale to fit depending on N txs, width of junctions
junctions <- .get_normalised_curve(
junctions,
junction.orientation,
junction.y.max
)
ggplot2::GeomLine$draw_panel(junctions, panel_params, coord)
}
)
#' @keywords internal
#' @noRd
.get_junction_curves <- function(data, angle, ncp) {
# very similar to springs example
# create the junction points, whilst preserving aes
# https://ggplot2-book.org/spring1.html#spring3
# TODO - implementation could probably be vectorised for speed
cols_to_keep <- setdiff(names(data), c("x", "xend", "y"))
junctions <- lapply(seq_len(nrow(data)), function(i) {
junction_curve <- .get_junction_curve(
data$x[i], data$xend[i], data$y[i],
angle, ncp
)
cbind(junction_curve, unclass(data[i, cols_to_keep]))
})
junctions <- do.call(rbind, junctions)
return(junctions)
}
#' @keywords internal
#' @noRd
.get_junction_curve <- function(x, xend, y, angle, ncp) {
# creates the points for each curve
curve_points <- calcControlPoints(
x1 = x, x2 = xend,
y1 = y, y2 = y,
angle = angle,
curvature = -0.5,
ncp = ncp
)
# need to re-add the original points as these not included
# by grid:::calcControlPoints
# makes sure junctions curves meet the intron lines
junction_curve <- data.frame(
x_points = c(x, curve_points$x, xend),
y_points = c(y, curve_points$y, y),
y_original = y
) %>%
dplyr::rename(
x = x_points,
y = y_points
)
return(junction_curve)
}
#' @keywords internal
#' @noRd
.get_normalised_curve <- function(junctions,
junction.orientation,
junction.y.max) {
# junction.y.max is equivalent to the max y of each junction curve
# each tx is internally uses y an integer
# scaling factor (sf) is used normalise the junction curve points
sf <- 1 / junction.y.max
# each curve point is normalised with relation to the original tx y
# first divided by the max(y), meaning all y values lie between 0-1
# then divided by the sf, setting the max y
if (junction.orientation == "top") {
junctions <- junctions %>% dplyr::mutate(
y = ifelse(y == y_original, y, y_original + (y / max(y)) / sf)
)
} else if (junction.orientation == "bottom") {
junctions <- junctions %>% dplyr::mutate(
y = ifelse(y == y_original, y, y_original - (y / max(y)) / sf)
)
} else if (junction.orientation == "alternating") {
junctions <- junctions %>% dplyr::mutate(y = dplyr::case_when(
y == y_original ~ y,
junction_index %% 2 == 0 ~ y_original - (y / max(y) / sf),
junction_index %% 2 == 1 ~ y_original + (y / max(y) / sf)
))
}
return(junctions)
}
#' @keywords internal
#' @noRd
.check_junction.orientation <- function(params) {
not_orient_option <-
!(params$junction.orientation %in% c("alternating", "top", "bottom"))
if (not_orient_option) {
stop(
"junction.orientation must be one of ",
"'alternating', 'top' or 'bottom'"
)
}
}
#' @keywords internal
#' @noRd
.check_junction.y.max <- function(params) {
if (length(params$junction.y.max) != 1) {
stop(
"junction.y.max must have a length of 1"
)
}
if (!is.numeric(params$junction.y.max)) {
stop(
"junction.y.max must be a numeric value (integer/double)"
)
}
}
calcControlPoints <- grid:::calcControlPoints
================================================
FILE: R/geom_junction_label_repel.R
================================================
#' Label junction curves
#'
#' `geom_junction_label_repel()` labels junction curves at their midpoint using
#' `ggrepel::geom_label_repel()`. This can be useful to label and compare
#' junctions (plotted using `geom_junction()`) with metrics of their usage (e.g.
#' read counts or percent-spliced-in).
#'
#' `geom_junction_label_repel()` requires the following `aes()`; `xstart`,
#' `xend`, `y` (e.g. transcript name) and `label`. Under the hood,
#' `geom_junction_label_repel()` generates the same junction curves as
#' `geom_junction()` to obtain curve midpoints for labeling. Therefore, it is
#' important that users use the same input data and parameters that alter
#' junction curves (namely `junction.orientation`, `junction.y.max`, `angle`,
#' `ncp`) for `geom_junction_label_repel()` that they have used for
#' `geom_junction()`.
#'
#' @inheritParams ggrepel::geom_text_repel
#' @inheritParams grid::curveGrob
#' @inheritParams geom_junction
#'
#' @return the return value of a `geom_*` function is not intended to be
#' directly handled by users. Therefore, `geom_*` functions should never be
#' executed in isolation, rather used in combination with a
#' `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # as well as a set of example (unannotated) junctions
#' # originating from GTEx and downloaded via the Bioconductor package snapcount
#' sod1_junctions
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(
#' type == "exon",
#' transcript_name == "SOD1-201"
#' )
#' sod1_exons %>% head()
#'
#' # add transcript_name to junctions for plotting
#' sod1_junctions <- sod1_junctions %>%
#' dplyr::mutate(transcript_name = "SOD1-201")
#'
#' # geom_junction_label_repel() can be used to label junctions
#' base <- sod1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = to_intron(sod1_exons, "transcript_name")
#' )
#'
#' # this can be useful to label junctions with their counts
#' base +
#' geom_junction(
#' data = sod1_junctions,
#' junction.y.max = 0.5
#' ) +
#' geom_junction_label_repel(
#' data = sod1_junctions,
#' aes(label = round(mean_count, 2)),
#' junction.y.max = 0.5
#' )
geom_junction_label_repel <- function(mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
parse = FALSE,
...,
junction.orientation = "alternating",
junction.y.max = 1,
angle = 90,
ncp = 15,
box.padding = 0.25,
label.padding = 0.25,
point.padding = 1e-6,
label.r = 0.15,
label.size = 0.25,
min.segment.length = 0,
arrow = NULL,
force = 1,
force_pull = 1,
max.time = 0.5,
max.iter = 10000,
max.overlaps = getOption("ggrepel.max.overlaps", default = 10),
nudge_x = 0,
nudge_y = 0,
xlim = c(NA, NA),
ylim = c(NA, NA),
na.rm = FALSE,
show.legend = NA,
direction = c("both", "y", "x"),
seed = NA,
verbose = FALSE,
inherit.aes = TRUE) {
if (!missing(nudge_x) || !missing(nudge_y)) {
if (!missing(position)) {
stop("Specify either `position` or `nudge_x`/`nudge_y`", call. = FALSE)
}
position <- position_nudge_repel(nudge_x, nudge_y)
}
ggplot2::layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomJunctionLabelRepel,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
parse = parse,
junction.orientation = junction.orientation,
junction.y.max = junction.y.max,
angle = angle,
ncp = ncp,
box.padding = to_unit(box.padding),
label.padding = to_unit(label.padding),
point.padding = to_unit(point.padding),
label.r = to_unit(label.r),
label.size = label.size,
min.segment.length = to_unit(min.segment.length),
arrow = arrow,
na.rm = na.rm,
force = force,
force_pull = force_pull,
max.time = max.time,
max.iter = max.iter,
max.overlaps = max.overlaps,
nudge_x = nudge_x,
nudge_y = nudge_y,
xlim = xlim,
ylim = ylim,
direction = match.arg(direction),
seed = seed,
verbose = verbose,
...
)
)
}
#' @include geom_junction.R
#' @keywords internal
#' @noRd
GeomJunctionLabelRepel <- ggplot2::ggproto(
"GeomJunctionLabelRepel", ggrepel::GeomLabelRepel,
required_aes = c("xstart", "xend", "y", "label"),
# copied from ggrepel::GeomLabelRepel with segment.colour and segment.alpha
# defaults set to appropriate values, rather than NULL
# this avoid warnings e.g. Unknown or uninitialised column: `segment.alpha`
# but does cause issues when setting e.g. aes(colour = tx)
# TODO - resolve either warning or make segment.colour borrow colour aes
default_aes = aes(
colour = "black",
fill = "white",
size = 3.88,
angle = 0,
alpha = NA,
family = "",
fontface = 1,
lineheight = 1.2,
hjust = 0.5,
vjust = 0.5,
point.size = 1,
segment.linetype = 1,
segment.colour = "black",
segment.size = 0.5,
segment.alpha = NA,
segment.curvature = 0,
segment.angle = 90,
segment.ncp = 1,
segment.shape = 0.5,
segment.square = TRUE,
segment.squareShape = 1,
segment.inflect = FALSE,
segment.debug = FALSE
),
setup_data = GeomJunction$setup_data,
draw_panel = function(data, panel_scales, coord,
parse = FALSE,
na.rm = FALSE,
junction.orientation = "alternating",
junction.y.max = 1,
angle = 90,
ncp = 15,
box.padding = 0.25,
label.padding = 0.25,
point.padding = 1e-6,
label.r = 0.15,
label.size = 0.25,
min.segment.length = 0,
arrow = NULL,
force = 1,
force_pull = 1,
max.time = 0.5,
max.iter = 10000,
max.overlaps = 10,
nudge_x = 0,
nudge_y = 0,
xlim = c(NA, NA),
ylim = c(NA, NA),
direction = "both",
seed = NA,
verbose = FALSE) {
# junction_index represents the order of each junction within tx
# needed for junction.orientation = "alternating"
data <- data %>%
dplyr::group_by(y) %>%
dplyr::mutate(junction_index = dplyr::row_number()) %>%
dplyr::ungroup()
# obtain the midpoints of junction curves (where we want label)
junction_midpoints <-
to_junction_midpoints(
data,
angle,
ncp,
junction.orientation,
junction.y.max
)
ggrepel::GeomLabelRepel$draw_panel(
data = junction_midpoints,
panel_scales = panel_scales,
coord = coord,
parse = parse,
na.rm = na.rm,
box.padding = box.padding,
label.padding = label.padding,
point.padding = point.padding,
label.r = label.r,
label.size = label.size,
min.segment.length = min.segment.length,
arrow = arrow,
force = force,
force_pull = force_pull,
max.time = max.time,
max.iter = max.iter,
max.overlaps = max.overlaps,
nudge_x = nudge_x,
nudge_y = nudge_y,
xlim = xlim,
ylim = ylim,
direction = direction,
seed = seed,
verbose = verbose
)
}
)
#' Wrapper for obtaining junction curve midpoints
#'
#' @keywords internal
#' @noRd
to_junction_midpoints <- function(data,
angle,
ncp,
junction.orientation,
junction.y.max) {
# TODO - maybe export this as helper?
junctions <- .get_junction_curves(data, angle, ncp)
junctions <- .get_normalised_curve(
junctions,
junction.orientation,
junction.y.max
)
junction_midpoints <- .get_curve_midpoints(junctions)
return(junction_midpoints)
}
#' @keywords internal
#' @noRd
.get_curve_midpoints <- function(junctions) {
# get the mid points of each curve for labeling junctions
# these are the points with the x value closest to median(x)
# this cannot be == median(x), this will not pick up point for even ncp's
junctions_mid <- junctions %>%
dplyr::group_by(group) %>%
dplyr::mutate(
median_x = stats::median(x),
median_diff = abs(x - median_x)
) %>%
dplyr::filter(median_diff == min(median_diff)) %>%
dplyr::ungroup() %>%
dplyr::select(-median_x, -median_diff)
return(junctions_mid)
}
to_unit <- ggrepel:::to_unit
================================================
FILE: R/geom_range.R
================================================
#' Plot genomic ranges
#'
#' `geom_range()` and `geom_half_range()` draw tiles that are designed to
#' represent range-based genomic features, such as exons. In combination with
#' `geom_intron()`, these geoms form the core components for visualizing
#' transcript structures.
#'
#' `geom_range()` and `geom_half_range()` require the following `aes()`;
#' `xstart`, `xend` and `y` (e.g. transcript name). `geom_half_range()` takes
#' advantage of the vertical symmetry of transcript annotation by plotting only
#' half of a range on the top or bottom of a transcript structure. This can be
#' useful for comparing between two transcripts or free up plotting space for
#' other transcript annotations (e.g. `geom_junction()`).
#'
#' @inheritParams ggplot2::layer
#' @inheritParams ggplot2::geom_point
#' @inheritParams ggplot2::geom_tile
#' @inheritParams ggplot2::geom_segment
#' @inheritParams grid::rectGrob
#'
#' @return the return value of a `geom_*` function is not intended to be
#' directly handled by users. Therefore, `geom_*` functions should never be
#' executed in isolation, rather used in combination with a
#' `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' base <- sod1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' ))
#'
#' # geom_range() is designed to visualise range-based annotation such as exons
#' base + geom_range()
#'
#' # geom_half_range() allows users to plot half ranges
#' # on the top or bottom of the transcript
#' base + geom_half_range()
#'
#' # where the half ranges are plotted can be adjusted using range.orientation
#' base + geom_half_range(range.orientation = "top")
#'
#' # as a ggplot2 extension, ggtranscript geoms inherit the
#' # the functionality from the parameters and aesthetics in ggplot2
#' base + geom_range(
#' aes(fill = transcript_name),
#' linewidth = 1
#' )
#'
#' # together, geom_range() and geom_intron() are designed to visualize
#' # the core components of transcript annotation
#' base + geom_range(
#' aes(fill = transcript_biotype)
#' ) +
#' geom_intron(
#' data = to_intron(sod1_exons, "transcript_name")
#' )
#'
#' # for protein coding transcripts
#' # geom_range() be useful for visualizing UTRs that lie outside of the CDS
#' sod1_exons_prot_coding <- sod1_exons %>%
#' dplyr::filter(transcript_biotype == "protein_coding")
#'
#' # extract cds
#' sod1_cds <- sod1_annotation %>%
#' dplyr::filter(type == "CDS")
#'
#' sod1_exons_prot_coding %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range(
#' fill = "white",
#' height = 0.25
#' ) +
#' geom_range(
#' data = sod1_cds
#' ) +
#' geom_intron(
#' data = to_intron(sod1_exons_prot_coding, "transcript_name")
#' )
#'
#' # geom_half_range() can be useful for comparing between two transcripts
#' # enabling visualization of one transcript on the top, other on the bottom
#' sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
#' sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201")
#' sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202")
#' sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202")
#'
#' sod1_201_plot <- sod1_201_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = "SOD1-201/202"
#' )) +
#' geom_half_range(
#' fill = "white",
#' height = 0.125
#' ) +
#' geom_half_range(
#' data = sod1_201_cds
#' ) +
#' geom_intron(
#' data = to_intron(sod1_201_exons, "transcript_name")
#' )
#'
#' sod1_201_plot
#'
#' sod1_201_202_plot <- sod1_201_plot +
#' geom_half_range(
#' data = sod1_202_exons,
#' range.orientation = "top",
#' fill = "white",
#' height = 0.125
#' ) +
#' geom_half_range(
#' data = sod1_202_cds,
#' range.orientation = "top",
#' fill = "purple"
#' ) +
#' geom_intron(
#' data = to_intron(sod1_202_exons, "transcript_name")
#' )
#'
#' sod1_201_202_plot
#'
#' # leveraging existing ggplot2 functionality via e.g. coord_cartesian()
#' # can be useful to zoom in on areas of interest
#' sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))
geom_range <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
...,
vjust = NULL,
linejoin = "mitre",
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
ggplot2::layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomRange,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
vjust = vjust,
linejoin = linejoin,
na.rm = na.rm,
...
)
)
}
#' `GeomRange` is `ggplot2::GeomTile` with modified `aes` to match genetic
#' nomenclature (`xstart`/`xend`)
#' @keywords internal
#' @noRd
GeomRange <- ggplot2::ggproto("GeomRange", ggplot2::GeomTile,
required_aes = c("xstart", "xend", "y"),
default_aes = aes(
fill = "grey",
colour = "black",
linewidth = 0.25,
linetype = 1,
alpha = NA,
height = NA
),
setup_data = function(data, params) {
# modified from ggplot2::GeomTile
data$height <- data$height %||% params$height %||% 0.5
transform(
data,
xmin = xstart,
xmax = xend,
ymin = y - height / 2,
ymax = y + height / 2,
height = NULL
)
},
draw_panel = function(self,
data,
panel_params,
coord,
vjust = NULL,
lineend = "butt",
linejoin = "mitre") {
if (!coord$is_linear()) {
# prefer to match geom_curve and warn
# rather than copy the implementation from GeomRect for simplicity
# also don'think geom_range would be used for non-linear coords
warn("geom_ is not implemented for non-linear coordinates")
}
coords <- coord$transform(data, panel_params)
grid::rectGrob(
coords$xmin, coords$ymax,
width = coords$xmax - coords$xmin,
height = coords$ymax - coords$ymin,
default.units = "native",
just = c("left", "top"),
vjust = vjust,
gp = grid::gpar(
col = coords$colour,
fill = ggplot2::alpha(coords$fill, coords$alpha),
lwd = coords$linewidth * ggplot2::.pt,
lty = coords$linetype,
linejoin = linejoin,
lineend = lineend
)
)
}
)
================================================
FILE: R/ggtranscript-package.R
================================================
#' `ggtranscript`: Visualizing transcript structure and annotation using
#' `ggplot2`
#'
#' The goal of `ggtranscript` is the simplify the process of visualizing
#' transcript structure and annotation. To achieve this, `ggtranscript`
#' introduces 5 new geoms (`geom_range()`, `geom_half_range()`, `geom_intron()`,
#' `geom_junction()` and `geom_junction_label_repel()`) as well as several
#' helper functions. As a `ggplot2` extension, `ggtranscript` inherits
#' `ggplot2`'s familiarity and flexibility, enabling users to intuitively adjust
#' aesthetics, parameters, scales etc as well as complement `ggtranscript` geoms
#' with existing `ggplot2` geoms to create informative, publication-ready plots.
#'
#' @docType package
#' @name ggtranscript
"_PACKAGE"
#' @importFrom rlang %||%
#' @importFrom magrittr %>%
#' @importFrom ggplot2 aes
#' @import ggrepel
NULL
================================================
FILE: R/globals.R
================================================
# bypass R CMD Check notes, related to tidyverse non-standard evaluation
# https://www.r-bloggers.com/2019/08/no-visible-binding-for-global-variable/
utils::globalVariables(c(
"x",
"start",
"end",
":=",
"intron_start",
"intron_end",
"exons",
"xend",
"mid",
"index",
"diff_type",
"in_exons",
"in_ref_exons",
".",
# shorten_gaps()
"width",
"rescaled_start",
"rescaled_end",
"width_tx_start",
"seqnames",
"strand",
"shorten_type",
"gap_width",
"shortened_gap_width",
"shortened_gap_diff",
"sum_shortened_gap_diff",
"intron_indexes",
"shortened_width",
"type",
# add_exon_number()
"exon_number",
"n_exons",
# geom_junction_label_repel()
"group",
"median",
"median_x",
"median_diff",
"x_points",
"y_points",
"y",
"y_original",
"position_nudge_repel",
# add_utr(),
"dummy_group"
))
================================================
FILE: R/shorten_gaps.R
================================================
#' Improve transcript structure visualization by shortening gaps
#'
#' For a given set of exons and introns, `shorten_gaps()` reduces the width of
#' gaps (regions that do not overlap any `exons`) to a user-inputted
#' `target_gap_width`. This can be useful when visualizing transcripts that have
#' long introns, to hone in on the regions of interest (i.e. exons) and better
#' compare between transcript structures.
#'
#' After `shorten_gaps()` reduces the size of gaps, it will re-scale `exons` and
#' `introns` to preserve exon alignment. This process will only reduce the width
#' of input `introns`, never `exons`. Importantly, the outputted re-scaled
#' co-ordinates should only be used for visualization as they will not match the
#' original genomic coordinates.
#'
#' @inheritParams to_diff
#' @param introns `data.frame()` the intron co-ordinates corresponding to the
#' input `exons`. This can be created by applying `to_intron()` to the
#' `exons`. If introns originate from multiple transcripts, they must be
#' differentiated using `group_var`. If a user is not using `to_intron()`,
#' they must make sure intron start/ends are defined precisely as the adjacent
#' exon boundaries (rather than exon end + 1 and exon start - 1).
#' @param target_gap_width `integer()` the width in base pairs to shorten the
#' gaps to.
#'
#' @return `data.frame()` contains the re-scaled co-ordinates of `introns` and
#' `exons` of each input transcript with shortened gaps.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' pknox1_annotation %>% head()
#'
#' # extract exons
#' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
#' pknox1_exons %>% head()
#'
#' # to_intron() is a helper function included in ggtranscript
#' # which is useful for converting exon co-ordinates to introns
#' pknox1_introns <- pknox1_exons %>% to_intron(group_var = "transcript_name")
#' pknox1_introns %>% head()
#'
#' # for transcripts with long introns, the exons of interest
#' # can be difficult to visualize clearly when using the default scale
#' pknox1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = pknox1_introns,
#' arrow.min.intron.length = 3500
#' )
#'
#' # in such cases it can be useful to rescale the exons and introns
#' # using shorten_gaps() which shortens regions that do not overlap an exon
#' pknox1_rescaled <-
#' shorten_gaps(pknox1_exons, pknox1_introns, group_var = "transcript_name")
#'
#' pknox1_rescaled %>% head()
#'
#' # this allows us to visualize differences in exonic structure more clearly
#' pknox1_rescaled %>%
#' dplyr::filter(type == "exon") %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = pknox1_rescaled %>% dplyr::filter(type == "intron"),
#' arrow.min.intron.length = 300
#' )
#'
#' # shorten_gaps() can be used in combination with to_diff()
#' # to further highlight differences in exon structure
#' # here, all other transcripts are compared to the MANE-select transcript
#' pknox1_rescaled_diffs <- to_diff(
#' exons = pknox1_rescaled %>%
#' dplyr::filter(type == "exon", transcript_name != "PKNOX1-201"),
#' ref_exons = pknox1_rescaled %>%
#' dplyr::filter(type == "exon", transcript_name == "PKNOX1-201"),
#' group_var = "transcript_name"
#' )
#'
#' pknox1_rescaled %>%
#' dplyr::filter(type == "exon") %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = pknox1_rescaled %>% dplyr::filter(type == "intron"),
#' arrow.min.intron.length = 300
#' ) +
#' geom_range(
#' data = pknox1_rescaled_diffs,
#' aes(fill = diff_type),
#' alpha = 0.2
#' )
shorten_gaps <- function(exons,
introns,
group_var = NULL,
target_gap_width = 100L) {
# input checks
.check_coord_object(exons, check_seqnames = TRUE, check_strand = TRUE)
.check_coord_object(introns, check_seqnames = TRUE, check_strand = TRUE)
.check_group_var(exons, group_var)
.check_group_var(introns, group_var)
target_gap_width <- .check_target_gap_width(target_gap_width)
# check type column, create if not present
exons <- .get_type(exons, "exons")
introns <- .get_type(introns, "introns")
# to_intron() defines introns using the exon boundaries
# we need to convert this to the actual gap definition to make sure
# comparison to GenomicRanges::gaps() when using "equal" works correctly
# this is converted back in .get_rescaled_txs()
introns <- introns %>%
dplyr::mutate(
start = start + 1,
end = end - 1
)
# we use GenomicRanges methods for downstream processing
exons_gr <- GenomicRanges::GRanges(exons)
introns_gr <- GenomicRanges::GRanges(introns)
# obtain actual gaps, i.e. regions that overlap no exons
intron_gaps <- .get_gaps(exons_gr)
# by mapping gaps back to introns, we can then shorten overlapping gaps
gap_map_intron <- .get_gap_map(introns_gr, intron_gaps)
introns_shortened <- .get_shortened_gaps(
introns,
intron_gaps,
gap_map_intron,
group_var,
target_gap_width
)
# don't have to take tx_start_gaps into account if only 1 tx
if (!is.null(group_var)) {
# because we're shortening intron_gaps, we also need to shorten the
# region from start of the plot and start of each tx (tx_start_gaps)
tx_start_gaps <- .get_tx_start_gaps(exons, group_var)
gap_map_tx_start_gaps <- .get_gap_map(
tx_start_gaps %>% GenomicRanges::GRanges(),
intron_gaps
)
tx_start_gaps_shortened <- .get_shortened_gaps(
tx_start_gaps,
intron_gaps,
gap_map_tx_start_gaps,
group_var,
target_gap_width
) %>%
dplyr::select(-start, -end, -strand, -seqnames, -strand)
}
rescaled_tx <- .get_rescaled_txs(
exons,
introns_shortened,
tx_start_gaps_shortened,
group_var
)
return(rescaled_tx)
}
#' Add a type column if it is not present already
#'
#' @keywords internal
#' @noRd
.get_type <- function(x, exons_introns) {
if (!is.null(x[["type"]])) {
# if there is an existing type column for introns
# need to make sure this is all "intron" for downstream functions
# don't check for exons, as this can be variable (e.g. "five_prime_utr")
if (exons_introns == "introns") {
allowed_types <- "intron"
if (!all(x[["type"]] %in% allowed_types)) {
stop(
"values in the 'type' column of ", exons_introns, " must be one of: ",
allowed_types %>% paste0("'", ., "'") %>% paste(collapse = ", ")
)
}
}
} else {
# if there isn't, we add a default type column
default_type <- ifelse(exons_introns == "exons", "exon", "intron")
x <- x %>% dplyr::mutate(type = default_type)
}
return(x)
}
#' @keywords internal
#' @noRd
.get_gaps <- function(exons_gr) {
orig_seqnames <- exons_gr %>%
GenomicRanges::seqnames() %>%
as.character() %>%
unique()
orig_strand <- exons_gr %>%
GenomicRanges::strand() %>%
as.character() %>%
unique()
# make sure we only have exons from a single transcript
.check_len_1_strand_seqnames(orig_seqnames, orig_strand)
# "reduce" exons - here meaning to collapse into single meta transcript
exons_gr_reduced <- exons_gr %>% GenomicRanges::reduce()
# keep only the relevant seqnames, otherwise gaps includes all seqlevels
GenomeInfoDb::seqlevels(exons_gr_reduced, pruning.mode = "coarse") <-
orig_seqnames
# obtain intronic gaps of the meta transcript
intron_gaps <- exons_gr_reduced %>%
GenomicRanges::gaps(
start = min(GenomicRanges::start(exons_gr_reduced)),
end = max(GenomicRanges::end(exons_gr_reduced))
)
# gaps creates a gap per strand too, keep only those from the original strand
intron_gaps <- intron_gaps %>%
.[GenomicRanges::strand(intron_gaps) == orig_strand]
return(intron_gaps)
}
#' @keywords internal
#' @noRd
.get_tx_start_gaps <- function(exons, group_var) {
# need to scale the transcript starts so scaled introns/exons align
# importantly, this tx start also has to take into account
# whether intron_gaps that overlap it have been shortened
# here, get the tx_start_gap - the region between
# 1. the start of plot (smallest start position of all txs)
# 2. the start of each tx
tx_start_gaps <-
exons %>%
dplyr::group_by_at(.vars = c(
group_var
)) %>%
dplyr::summarise(
seqnames = unique(seqnames),
strand = unique(strand),
end = min(start), # min start of this transcript
start = min(exons[["start"]]) # min start of all transcripts
)
return(tx_start_gaps)
}
#' map the gaps back to introns/transcript start gaps
#' @keywords internal
#' @noRd
.get_gap_map <- function(y, intron_gaps) {
# when we reduce the length of the intron_gaps, whilst making sure
# whilst making sure the exons/introns remain aligned
# to do this, we need to map the intron_gaps back onto the introns
# the simplest case is when gaps are identical to original introns
equal_hits <- GenomicRanges::findOverlaps(
intron_gaps,
y,
type = "equal"
)
# often, the intron_gaps don't map identically
# this occurs due to the exons of one tx overlapping the intron of another
# we find cases when the gaps are completely contained an original intron
# using type = "within", but this also catches the "equal" intron_gaps
within_hits <- GenomicRanges::findOverlaps(
intron_gaps,
y,
type = "within"
)
# convert to data.frame() to use dplyr::anti_join()
equal_hits <- equal_hits %>% as.data.frame()
within_hits <- within_hits %>% as.data.frame()
# remove the "equal" hits from the "within"
pure_within_hits <- within_hits %>%
dplyr::anti_join(equal_hits, by = c("queryHits", "subjectHits"))
# need both "equal" and "pure_within" hits
gap_map <- list(
equal = equal_hits,
pure_within = pure_within_hits
)
return(gap_map)
}
#' @keywords internal
#' @noRd
.get_shortened_gaps <- function(y,
intron_gaps,
gap_map,
group_var,
target_gap_width) {
# we need the intron/tx_start_gap widths (to shorten them)
y <- y %>% dplyr::mutate(width = (end - start) + 1)
# characterise introns by shortening type
y_shortened <- y %>%
dplyr::mutate(
shorten_type = dplyr::case_when(
dplyr::row_number() %in% gap_map[["equal"]][["subjectHits"]] ~
"equal",
dplyr::row_number() %in% gap_map[["pure_within"]][["subjectHits"]] ~
"pure_within",
TRUE ~ "none"
)
)
# for the "equal" cases, simply shorten the widths to the target_gap_width
y_shortened <- y_shortened %>%
dplyr::mutate(
shortened_width = ifelse(
(shorten_type == "equal") & (width > target_gap_width),
target_gap_width,
width
)
)
# for the "within" cases we need to shorten the intron widths
# by the !total! amount the overlapping gaps are shortened
overlapping_gap_indexes <- gap_map[["pure_within"]][["queryHits"]]
# only have to this if there are gaps that are "pure_within"
if (length(overlapping_gap_indexes) > 0) {
# one intron may overlap multiple gaps
# first, calculate the sum of the reduction in gap widths
sum_gap_diff <- dplyr::tibble(
intron_indexes = gap_map[["pure_within"]][["subjectHits"]],
gap_width = GenomicRanges::width(intron_gaps)[overlapping_gap_indexes]
) %>%
dplyr::mutate(
shortened_gap_width = ifelse(
gap_width > target_gap_width,
target_gap_width,
gap_width
),
shortened_gap_diff = gap_width - shortened_gap_width,
) %>%
dplyr::group_by(intron_indexes) %>%
dplyr::summarise(
sum_shortened_gap_diff = sum(shortened_gap_diff)
)
# now actually do reduction for introns with "pure_within" gaps
y_shortened[["sum_shortened_gap_diff"]] <- NA_integer_
y_shortened[["sum_shortened_gap_diff"]][sum_gap_diff[["intron_indexes"]]] <-
sum_gap_diff[["sum_shortened_gap_diff"]]
y_shortened <- y_shortened %>%
dplyr::mutate(
shortened_width = ifelse(
is.na(sum_shortened_gap_diff),
shortened_width,
width - sum_shortened_gap_diff
)
) %>%
dplyr::select(-sum_shortened_gap_diff)
}
# remove unecessary intermediate cols
y_shortened <- y_shortened %>%
dplyr::select(
-shorten_type,
-width,
width = shortened_width
)
return(y_shortened)
}
#' @keywords internal
#' @noRd
.get_rescaled_txs <- function(exons,
introns_shortened,
tx_start_gaps_shortened,
group_var) {
# calculate the rescaled exon/intron start/ends using
# the widths of the exons and reduced introns
rescaled_tx <- exons %>% dplyr::mutate(
width = (end - start) + 1
)
# bind together exons and introns and arrange into genomic order
rescaled_tx <- rescaled_tx %>%
dplyr::bind_rows(
introns_shortened
) %>%
dplyr::arrange_at(.vars = c(group_var, "start", "end"))
# calculate the rescaled coords using cumsum of the widths of introns/exons
rescaled_tx <- rescaled_tx %>%
dplyr::group_by_at(.vars = c(
group_var
)) %>%
dplyr::mutate(
rescaled_end = cumsum(width),
rescaled_start = rescaled_end - (width - 1)
) %>%
dplyr::ungroup()
# account for the tx starts being in different places
# to keep everything aligned
if (is.null(group_var)) {
# if only 1 tx, we use 1 as the dummy rescaled tx_start
rescaled_tx <- rescaled_tx %>%
dplyr::mutate(width_tx_start = 1)
} else {
rescaled_tx <- rescaled_tx %>%
dplyr::left_join(
tx_start_gaps_shortened,
by = c(group_var),
suffix = c("", "_tx_start")
)
}
rescaled_tx <- rescaled_tx %>%
dplyr::mutate(
rescaled_end = rescaled_end + width_tx_start,
rescaled_start = rescaled_start + width_tx_start
) %>%
dplyr::select(-dplyr::contains("width"))
# convert introns back to be defined by exon boundaries, match to_intron()
rescaled_tx <- rescaled_tx %>%
dplyr::mutate(
start = ifelse(type == "intron", start - 1, start),
end = ifelse(type == "intron", end + 1, end),
rescaled_start = ifelse(
type == "intron", rescaled_start - 1, rescaled_start
),
rescaled_end = ifelse(
type == "intron", rescaled_end + 1, rescaled_end
)
)
# remove original start/end
rescaled_tx <- rescaled_tx %>% dplyr::select(-start, -end)
rescaled_tx <- rescaled_tx %>%
dplyr::select(
seqnames,
start = rescaled_start,
end = rescaled_end,
strand,
dplyr::everything()
)
return(rescaled_tx)
}
#' we expect the exons to originate from a single gene.
#' therefore, unique strand and seqnames should be of length 1
#' @keywords internal
#' @noRd
.check_len_1_strand_seqnames <- function(orig_seqnames, orig_strand) {
ab_1_uniq <- "of object contains more than 1 unique value. "
reason <- "object is expected to contain exons from a single gene."
if (length(orig_seqnames) != 1) {
stop("seqnames ", ab_1_uniq, reason)
}
if (length(orig_strand) != 1) {
stop("strand ", ab_1_uniq, reason)
}
}
#' @keywords internal
#' @noRd
.check_target_gap_width <- function(target_gap_width) {
if (!is.integer(target_gap_width)) {
warning("target_gap_width must be an integer, coercing...")
target_gap_width <- target_gap_width %>%
as.integer()
}
return(target_gap_width)
}
================================================
FILE: R/to_diff.R
================================================
#' Obtain the differences between transcript structure
#'
#' `to_diff()` obtains the difference between `exons` from a set of transcripts
#' to a reference transcript (`ref_exons`). This can be useful when visualizing
#' the differences between transcript structure. `to_diff()` expects two sets of
#' input exons; 1. `exons` - exons from any number of transcripts that will be
#' compared to `ref_exons` and 2. `ref_exons` - exons from a single transcript
#' which acts as the reference to compare against.
#'
#' @param exons `data.frame()` contains exons which can originate from multiple
#' transcripts differentiated by `group_var`.
#' @param ref_exons `data.frame()` contains exons that originate from a single
#' transcript, which `exons` will be compared against.
#' @param group_var `character()` if input data originates from more than 1
#' transcript, `group_var` must specify the column that differentiates
#' transcripts (e.g. "transcript_id").
#'
#' @return `data.frame()` details the differences between `exons` and
#' `ref_exons`.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' # for this example, let's compare transcripts to the MANE-select transcript
#' sod1_mane <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
#' sod1_not_mane <- sod1_exons %>% dplyr::filter(transcript_name != "SOD1-201")
#'
#' # to_diff() obtains the differences between the exons as ranges
#' sod1_diffs <- to_diff(
#' exons = sod1_not_mane,
#' ref_exons = sod1_mane,
#' group_var = "transcript_name"
#' )
#'
#' sod1_diffs %>% head()
#'
#' # using geom_range(), it can be useful to visually overlay
#' # the differences on top of the transcript annotation
#' sod1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = to_intron(sod1_exons, "transcript_name")
#' ) +
#' geom_range(
#' data = sod1_diffs,
#' ggplot2::aes(fill = diff_type),
#' alpha = 0.2
#' )
to_diff <- function(exons, ref_exons, group_var = NULL) {
.check_coord_object(exons, check_seqnames = TRUE, check_strand = TRUE)
.check_coord_object(ref_exons, check_seqnames = TRUE, check_strand = TRUE)
.check_group_var(exons, group_var)
# need to remember if group is NULL for downstream
null_group <- is.null(group_var)
# we have to create dummy group if there is no group for .get_diff
if (null_group) {
exons <- exons %>% dplyr::mutate(dummy_group = "A")
group_var <- "dummy_group"
}
diffs <- .get_diff(exons, ref_exons, group_var)
# remove the dummy_group if created
if (null_group) diffs[[group_var]] <- NULL
return(diffs)
}
#' The heavy lifting of `to_diff()` happens here.
#'
#' @keywords internal
#' @noRd
.get_diff <- function(exons, ref_exons, group_var) {
groups <- exons[[group_var]] %>% unique()
# needs to be a genomic range for downstream processing
exons_gr <- GenomicRanges::GRanges(exons)
ref_exons_gr <- GenomicRanges::GRanges(ref_exons)
diffs <- vector("list", length = length(group_var))
for (i in seq_along(groups)) {
exons_gr_curr <- exons_gr %>%
.[GenomicRanges::mcols(exons_gr)[[group_var]] == groups[i]]
# get the disjoint pieces (flattening and breaking apart exons)
disjoint_pieces <- GenomicRanges::disjoin(
c(ref_exons_gr, exons_gr_curr)
)
# find whether the disjoint pieces overlap exons or ref_exons
# those that only overlap 1 are the differences
# TODO - perhaps allow modification of findOverlaps() via ... ?
overlap_exons <- GenomicRanges::findOverlaps(
disjoint_pieces, exons_gr_curr
)
overlap_ref_exons <- GenomicRanges::findOverlaps(
disjoint_pieces, ref_exons_gr
)
# convert pieces back to data.frame and classify diffs
# TODO - could improve efficiency by placing this step post-loop
# i.e. manipulate the grs instead
diff_curr <- disjoint_pieces %>%
as.data.frame() %>%
dplyr::mutate(
index = dplyr::row_number(),
type = "diff",
in_exons = index %in% S4Vectors::queryHits(overlap_exons),
in_ref_exons = index %in% S4Vectors::queryHits(overlap_ref_exons)
) %>%
dplyr::mutate(
diff_type = dplyr::case_when(
in_exons & in_ref_exons ~ "both",
in_exons & !in_ref_exons ~ "not_in_ref",
!in_exons & in_ref_exons ~ "in_ref"
)
)
# add back in group info
diff_curr[[group_var]] <- groups[i]
# keep only diffs and necessary cols
diffs[[i]] <-
diff_curr %>%
dplyr::filter(diff_type != "both") %>%
dplyr::select(-in_exons, -in_ref_exons, -index)
}
diffs <- diffs %>% do.call(dplyr::bind_rows, .)
return(diffs)
}
================================================
FILE: R/to_intron.R
================================================
#' Convert exon co-ordinates to introns
#'
#' Given a set of `exons`, `to_intron()` will return the corresponding introns.
#'
#' It is important to note that, for visualization purposes, `to_intron()`
#' defines introns precisely as the exon boundaries, rather than the intron
#' start/end being (exon end + 1)/(exon start - 1).
#'
#' @inheritParams to_diff
#'
#' @return `data.frame()` contains the intron co-ordinates.
#'
#' @export
#' @examples
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' # to_intron() is a helper function included in ggtranscript
#' # which is useful for converting exon co-ordinates to introns
#' sod1_introns <- sod1_exons %>% to_intron(group_var = "transcript_name")
#' sod1_introns %>% head()
#'
#' # this can be particular useful when combined with
#' # geom_range() and geom_intron()
#' # to visualize the core components of transcript annotation
#' sod1_exons %>%
#' ggplot(aes(
#' xstart = start,
#' xend = end,
#' y = transcript_name
#' )) +
#' geom_range() +
#' geom_intron(
#' data = to_intron(sod1_exons, "transcript_name")
#' )
to_intron <- function(exons, group_var = NULL) {
.check_coord_object(exons)
.check_group_var(exons, group_var)
# TODO - switch this to using GenomicRanges::gaps()?
if (!is.null(group_var)) {
exons <- exons %>% dplyr::group_by_at(.vars = group_var)
}
# make sure exons are arranged by coord, so that dplyr::lag works correctly
exons <- exons %>%
dplyr::arrange(start, end)
# obtain intron start and ends
introns <- exons %>%
dplyr::mutate(
intron_start := dplyr::lag(end),
intron_end := start,
type = "intron"
) %>%
dplyr::select(-start, -end)
# remove the introduced artifact NAs
introns <- introns %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(intron_start) & !is.na(intron_end))
# filter out introns with a width of 1, this should only happen when
# utrs are included and are directly adjacent to end of cds
introns <- introns %>% dplyr::filter(abs(intron_end - intron_start) != 1)
introns <- introns %>% dplyr::rename(start = intron_start, end = intron_end)
return(introns)
}
================================================
FILE: R/utils.R
================================================
#' @keywords internal
#' @noRd
.check_coord_object <- function(x,
check_seqnames = FALSE,
check_strand = FALSE) {
if (!is.data.frame(x)) {
stop(
"object must be a data.frame. ",
"GRanges objects are currently not supported and must be converted ",
"using e.g. as.data.frame()"
)
}
if (!all(c("start", "end") %in% colnames(x))) {
stop("object must have the columns 'start' and 'end'")
}
if (check_seqnames) {
if (!("seqnames" %in% colnames(x))) {
stop("object must have the column 'seqnames'")
}
}
if (check_strand) {
if (!("strand" %in% colnames(x))) {
stop("object must have the column 'strand'")
}
}
}
#' @keywords internal
#' @noRd
.check_group_var <- function(x, group_var) {
if (!is.null(group_var)) {
if (!all(group_var %in% colnames(x))) {
stop(
"group_var ('", group_var, "') ",
"must be a column in object"
)
}
}
}
================================================
FILE: README.Rmd
================================================
---
output: github_document
---
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "man/figures/README-",
out.width = "100%",
dpi = 300
)
```
# ggtranscript
[](https://github.com/dzhang32/ggtranscript/issues)
[](https://github.com/dzhang32/ggtranscript/pulls)
[](https://lifecycle.r-lib.org/articles/stages.html#experimental)
[](https://github.com/dzhang32/ggtranscript/actions)
[](https://app.codecov.io/gh/dzhang32/ggtranscript?branch=main)
`ggtranscript` is a `ggplot2` extension that makes it to easy to visualize transcript structure and annotation.
## Installation
```{r "install_dev", eval = FALSE}
# you can install the development version of ggtranscript from GitHub:
# install.packages("devtools")
devtools::install_github("dzhang32/ggtranscript")
```
## Usage
`ggtranscript` introduces 5 new geoms (`geom_range()`, `geom_half_range()`, `geom_intron()`, `geom_junction()` and `geom_junction_label_repel()`) and several helper functions designed to facilitate the visualization of transcript structure and annotation. The following guide takes you on a quick tour of using these geoms, for a more detailed overview see the [Getting Started tutorial](https://dzhang32.github.io/ggtranscript/articles/ggtranscript.html).
`geom_range()` and `geom_intron()` enable the plotting of exons and introns, the core components of transcript annotation. `ggtranscript` also provides `to_intron()`, which converts exon co-ordinates to the corresponding introns. Together, `ggtranscript` enables users to plot transcript structures with only exons as the required input and just a few lines of code.
```{r geom-range-intron}
library(magrittr)
library(dplyr)
library(ggplot2)
library(ggtranscript)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation %>% head()
# extract exons
sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
sod1_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = to_intron(sod1_exons, "transcript_name"),
aes(strand = strand)
)
```
`ggtranscript` provides the helper function `shorten_gaps()`, which reduces the size of the gaps. `shorten_gaps()` then rescales the exon and intron co-ordinates to preserve the original exon alignment. This allows you to hone in the differences in the exonic structure, which can be particularly useful if the transcript has relatively long introns.
```{r shorten-gaps}
sod1_rescaled <- shorten_gaps(
sod1_exons,
to_intron(sod1_exons, "transcript_name"),
group_var = "transcript_name"
)
sod1_rescaled %>%
dplyr::filter(type == "exon") %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = sod1_rescaled %>% dplyr::filter(type == "intron"),
arrow.min.intron.length = 200
)
```
`geom_range()` can be used for any range-based genomic annotation. For example, when plotting protein-coding transcripts, users may find it helpful to visually distinguish the coding segments from UTRs.
```{r geom-range-intron-w-cds}
# filter for only exons from protein coding transcripts
sod1_exons_prot_cod <- sod1_exons %>%
dplyr::filter(transcript_biotype == "protein_coding")
# obtain cds
sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS")
sod1_exons_prot_cod %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_cds
) +
geom_intron(
data = to_intron(sod1_exons_prot_cod, "transcript_name"),
aes(strand = strand),
arrow.min.intron.length = 500,
)
```
`geom_half_range()` takes advantage of the vertical symmetry of transcript annotation by plotting only half of a range on the top or bottom of a transcript structure. One use case of `geom_half_range()` is to visualize the differences between transcript structure more clearly.
```{r geom-half-range, fig.height = 3}
# extract exons and cds for the two transcripts to be compared
sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202")
sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202")
sod1_201_202_plot <- sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = "SOD1-201/202"
)) +
geom_half_range(
fill = "white",
height = 0.125
) +
geom_half_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_half_range(
data = sod1_202_exons,
range.orientation = "top",
fill = "white",
height = 0.125
) +
geom_half_range(
data = sod1_202_cds,
range.orientation = "top",
fill = "purple"
) +
geom_intron(
data = to_intron(sod1_202_exons, "transcript_name")
)
sod1_201_202_plot
```
As a `ggplot2` extension, `ggtranscript` inherits the the familiarity and functionality of `ggplot2`. For instance, by leveraging `coord_cartesian()` users can zoom in on regions of interest.
```{r geom-half-range-zoomed, fig.height = 3}
sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))
```
`geom_junction()` enables to plotting of junction curves, which can be overlaid across transcript structures. `geom_junction_label_repel()` adds a label to junction curves, which can often be useful to mark junctions with a metric of their usage such as read counts.
```{r geom-junction, fig.height = 3}
# ggtranscript includes a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions
# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions %>%
dplyr::mutate(transcript_name = "SOD1-201")
sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_junction(
data = sod1_junctions,
junction.y.max = 0.5
) +
geom_junction_label_repel(
data = sod1_junctions,
aes(label = round(mean_count, 2)),
junction.y.max = 0.5
)
```
Alternatively, users may prefer to map junction read counts to the thickness of the junction curves. As a `ggplot2` extension, this can be done intuitively by modifying the size `aes()` of `geom_junction()`. In addition, by modifying `ggplot2` scales and themes, users can easily create informative, publication-ready plots.
```{r geom-junction-pub, fig.height = 3}
sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_junction(
data = sod1_junctions,
aes(size = mean_count),
junction.y.max = 0.5,
ncp = 30,
colour = "purple"
) +
scale_size_continuous(range = c(0.1, 1), guide = "none") +
xlab("Genomic position (chr21)") +
ylab("Transcript name") +
theme_bw()
```
## Citation
```{r citing-ggtranscript}
citation("ggtranscript")
```
## Credits
* `ggtranscript` was developed using `biocthis`.
================================================
FILE: README.md
================================================
# ggtranscript
[](https://github.com/dzhang32/ggtranscript/issues)
[](https://github.com/dzhang32/ggtranscript/pulls)
[](https://lifecycle.r-lib.org/articles/stages.html#experimental)
[](https://github.com/dzhang32/ggtranscript/actions)
[](https://app.codecov.io/gh/dzhang32/ggtranscript?branch=main)
`ggtranscript` is a `ggplot2` extension that makes it to easy to
visualize transcript structure and annotation.
## Installation
``` r
# you can install the development version of ggtranscript from GitHub:
# install.packages("devtools")
devtools::install_github("dzhang32/ggtranscript")
```
## Usage
`ggtranscript` introduces 5 new geoms (`geom_range()`,
`geom_half_range()`, `geom_intron()`, `geom_junction()` and
`geom_junction_label_repel()`) and several helper functions designed to
facilitate the visualization of transcript structure and annotation. The
following guide takes you on a quick tour of using these geoms, for a
more detailed overview see the [Getting Started
tutorial](https://dzhang32.github.io/ggtranscript/articles/ggtranscript.html).
`geom_range()` and `geom_intron()` enable the plotting of exons and
introns, the core components of transcript annotation. `ggtranscript`
also provides `to_intron()`, which converts exon co-ordinates to the
corresponding introns. Together, `ggtranscript` enables users to plot
transcript structures with only exons as the required input and just a
few lines of code.
``` r
library(magrittr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
library(ggtranscript)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation %>% head()
#> # A tibble: 6 × 8
#> seqnames start end strand type gene_name transcript_name
#>
#> 1 21 31659666 31668931 + gene SOD1
#> 2 21 31659666 31668931 + transcript SOD1 SOD1-202
#> 3 21 31659666 31659784 + exon SOD1 SOD1-202
#> 4 21 31659770 31659784 + CDS SOD1 SOD1-202
#> 5 21 31659770 31659772 + start_codon SOD1 SOD1-202
#> 6 21 31663790 31663886 + exon SOD1 SOD1-202
#> # ℹ 1 more variable: transcript_biotype
# extract exons
sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
sod1_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = to_intron(sod1_exons, "transcript_name"),
aes(strand = strand)
)
```
`ggtranscript` provides the helper function `shorten_gaps()`, which
reduces the size of the gaps. `shorten_gaps()` then rescales the exon
and intron co-ordinates to preserve the original exon alignment. This
allows you to hone in the differences in the exonic structure, which can
be particularly useful if the transcript has relatively long introns.
``` r
sod1_rescaled <- shorten_gaps(
sod1_exons,
to_intron(sod1_exons, "transcript_name"),
group_var = "transcript_name"
)
sod1_rescaled %>%
dplyr::filter(type == "exon") %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = sod1_rescaled %>% dplyr::filter(type == "intron"),
arrow.min.intron.length = 200
)
```
`geom_range()` can be used for any range-based genomic annotation. For
example, when plotting protein-coding transcripts, users may find it
helpful to visually distinguish the coding segments from UTRs.
``` r
# filter for only exons from protein coding transcripts
sod1_exons_prot_cod <- sod1_exons %>%
dplyr::filter(transcript_biotype == "protein_coding")
# obtain cds
sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS")
sod1_exons_prot_cod %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_cds
) +
geom_intron(
data = to_intron(sod1_exons_prot_cod, "transcript_name"),
aes(strand = strand),
arrow.min.intron.length = 500,
)
```
`geom_half_range()` takes advantage of the vertical symmetry of
transcript annotation by plotting only half of a range on the top or
bottom of a transcript structure. One use case of `geom_half_range()` is
to visualize the differences between transcript structure more clearly.
``` r
# extract exons and cds for the two transcripts to be compared
sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202")
sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202")
sod1_201_202_plot <- sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = "SOD1-201/202"
)) +
geom_half_range(
fill = "white",
height = 0.125
) +
geom_half_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_half_range(
data = sod1_202_exons,
range.orientation = "top",
fill = "white",
height = 0.125
) +
geom_half_range(
data = sod1_202_cds,
range.orientation = "top",
fill = "purple"
) +
geom_intron(
data = to_intron(sod1_202_exons, "transcript_name")
)
sod1_201_202_plot
```
As a `ggplot2` extension, `ggtranscript` inherits the the familiarity
and functionality of `ggplot2`. For instance, by leveraging
`coord_cartesian()` users can zoom in on regions of interest.
``` r
sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))
```
`geom_junction()` enables to plotting of junction curves, which can be
overlaid across transcript structures. `geom_junction_label_repel()`
adds a label to junction curves, which can often be useful to mark
junctions with a metric of their usage such as read counts.
``` r
# ggtranscript includes a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions
#> # A tibble: 5 × 5
#> seqnames start end strand mean_count
#>
#> 1 chr21 31659787 31666448 + 0.463
#> 2 chr21 31659842 31660554 + 0.831
#> 3 chr21 31659842 31663794 + 0.316
#> 4 chr21 31659842 31667257 + 4.35
#> 5 chr21 31660351 31663789 + 0.324
# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions %>%
dplyr::mutate(transcript_name = "SOD1-201")
sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_junction(
data = sod1_junctions,
junction.y.max = 0.5
) +
geom_junction_label_repel(
data = sod1_junctions,
aes(label = round(mean_count, 2)),
junction.y.max = 0.5
)
```
Alternatively, users may prefer to map junction read counts to the
thickness of the junction curves. As a `ggplot2` extension, this can be
done intuitively by modifying the size `aes()` of `geom_junction()`. In
addition, by modifying `ggplot2` scales and themes, users can easily
create informative, publication-ready plots.
``` r
sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_junction(
data = sod1_junctions,
aes(size = mean_count),
junction.y.max = 0.5,
ncp = 30,
colour = "purple"
) +
scale_size_continuous(range = c(0.1, 1), guide = "none") +
xlab("Genomic position (chr21)") +
ylab("Transcript name") +
theme_bw()
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
```
## Citation
``` r
citation("ggtranscript")
#> To cite package 'ggtranscript' in publications use:
#>
#> Gustavsson EK, Zhang D, Reynolds RH, Garcia-Ruiz S, Ryten M (2022).
#> "ggtranscript: an R package for the visualization and interpretation
#> of transcript isoforms using ggplot2." _Bioinformatics_.
#> doi:10.1093/bioinformatics/btac409
#> ,
#> .
#>
#> A BibTeX entry for LaTeX users is
#>
#> @Article{,
#> title = {ggtranscript: an R package for the visualization and interpretation of transcript isoforms using ggplot2},
#> author = {Emil K Gustavsson and David Zhang and Regina H Reynolds and Sonia Garcia-Ruiz and Mina Ryten},
#> year = {2022},
#> journal = {Bioinformatics},
#> doi = {https://doi.org/10.1093/bioinformatics/btac409},
#> url = {https://academic.oup.com/bioinformatics/article/38/15/3844/6617821},
#> }
```
## Credits
- `ggtranscript` was developed using `biocthis`.
================================================
FILE: _pkgdown.yml
================================================
template:
bootstrap: 5
bootswatch: cosmo
================================================
FILE: codecov.yml
================================================
comment: false
coverage:
status:
project:
default:
target: auto
threshold: 1%
informational: true
patch:
default:
target: auto
threshold: 1%
informational: true
================================================
FILE: data-raw/ggplot2_exts_thumbnail.R
================================================
# Load libraries ----------------------------------------------------------
library(tidyverse)
devtools::load_all(".")
# Main --------------------------------------------------------------------
sod1_201_exons <- sod1_annotation %>%
dplyr::filter(
type == "exon",
transcript_name == "SOD1-201"
)
sod1_201_cds <- sod1_annotation %>%
dplyr::filter(
type == "CDS",
transcript_name == "SOD1-201"
)
sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201")
ggplot2_exts_figure <- sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.125
) +
geom_range(
data = sod1_201_cds,
height = 0.25
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_junction(
data = sod1_junctions,
aes(size = mean_count),
junction.y.max = 0.25,
ncp = 30,
colour = "purple"
) +
scale_size_continuous(range = c(0.1, 1), guide = "none") +
xlab("Genomic position (chr21)") +
ylab("Transcript name") +
theme_bw() +
theme(
axis.line = element_line(colour = "black"),
panel.grid = element_blank(),
panel.border = element_blank()
)
ggplot2_exts_figure
# Save data ---------------------------------------------------------------
ggsave(
plot = ggplot2_exts_figure,
filename = here::here("man", "figures", "dzhang32-ggtranscript.png"),
height = 3,
width = 3.5,
dpi = 600
)
================================================
FILE: data-raw/ggtranscript_logo.R
================================================
# Load libraries ----------------------------------------------------------
library(tidyverse)
library(hexSticker)
library(showtext)
devtools::load_all(".")
# Main --------------------------------------------------------------------
logo_exons <- tribble(
~start, ~end, ~tx, ~letter,
#-------|-----|----|--------
150, 200, "J", "T",
500, 550, "J", "T",
300, 310, "I", "T",
350, 400, "I", "T",
300, 350, "H", "T",
390, 400, "H", "T",
300, 310, "G", "T",
350, 400, "G", "T",
700, 800, "J", "X_top",
1100, 1200, "J", "X_top",
700, 800, "G", "X_bot",
1100, 1200, "G", "X_bot"
) %>%
dplyr::mutate(
tx = tx %>% factor(
levels = LETTERS[1:17]
)
)
logo_utr <- tribble(
~start, ~end, ~tx, ~letter,
#-------|-----|----|--------
100, 150, "J", "T",
550, 600, "J", "T",
)
logo_introns <- logo_exons %>%
dplyr::filter(letter == "T") %>%
to_intron(group_var = "tx")
logo_junctions <- logo_exons %>%
dplyr::filter(letter %in% c("X_top", "X_bot")) %>%
to_intron(group_var = "tx")
size <- 0.3
colour <- "black"
fill <- ggpubr::get_palette("jco", 10)[10]
# create T
ggtranscript_logo <- logo_exons %>%
dplyr::filter(letter == "T") %>%
ggplot(aes(
xstart = start,
xend = end,
y = tx
)) +
geom_range(
fill = fill,
size = size,
colour = colour
) +
geom_range(
data = logo_utr,
fill = "white",
height = 0.25,
size = size,
colour = colour
) +
geom_intron(
data = logo_introns,
size = size,
colour = colour,
arrow.min.intron.length = 100
)
ggtranscript_logo <- ggtranscript_logo +
geom_half_range(
data = logo_exons %>% dplyr::filter(letter == "X_top"),
range.orientation = "top",
fill = fill,
size = size,
colour = colour,
) +
geom_half_range(
data = logo_exons %>% dplyr::filter(letter == "X_bot"),
fill = fill,
size = size,
colour = colour,
) +
geom_junction(
data = logo_junctions %>% dplyr::filter(letter == "X_bot"),
size = size,
colour = colour,
junction.orientation = "top",
junction.y.max = 1.4,
ncp = 50
) +
geom_junction(
data = logo_junctions %>% dplyr::filter(letter == "X_top"),
size = size,
colour = colour,
junction.orientation = "bottom",
junction.y.max = 1.4,
ncp = 50
)
ggtranscript_logo <- ggtranscript_logo +
scale_x_continuous(
limits = c(-300, 1600),
minor_breaks = seq(-300, 1500, 100)
) +
scale_y_discrete(drop = FALSE) +
theme_bw() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
panel.grid.major = element_line(
size = size,
colour = ggpubr::get_palette("Greys", 10)[2]
),
panel.grid.minor = element_line(
size = size,
colour = ggpubr::get_palette("Greys", 10)[2]
)
)
ggtranscript_logo
# Save data ---------------------------------------------------------------
# use font from https://fonts.google.com
font_add_google(name = "Raleway", family = "Raleway")
showtext_auto()
ggtranscript_logo_hex <- hexSticker::sticker(
# the plot (TX)
subplot = ggtranscript_logo,
s_x = 0.98,
s_y = 1.2,
s_width = 2.8,
s_height = 3,
# the package
package = "ggtranscript",
p_x = 1,
p_y = 0.65,
p_size = 35,
p_family = "Raleway",
p_fontface = "bold",
p_color = ggpubr::get_palette("jco", 10)[6],
# hex border
h_color = ggpubr::get_palette("jco", 10)[6],
h_fill = "white",
h_size = 2,
# url
url = "https://github.com/dzhang32/ggtranscript",
u_family = "Raleway",
u_color = ggpubr::get_palette("jco", 10)[6],
u_size = 6.5,
# general
filename = here::here("man", "figures", "ggtranscript_logo.png"),
dpi = 600,
white_around_sticker = TRUE
)
# here::here("man", "figures", "ggtranscript_logo.png") is then
# manually cropped to remove the white background in Inkscape
plot(ggtranscript_logo_hex)
================================================
FILE: data-raw/sod1_junctions.R
================================================
# Load libraries ----------------------------------------------------------
library(tidyverse)
library(snapcount)
library(SummarizedExperiment)
# Main --------------------------------------------------------------------
# obtain gtex junctions across SOD1
sod1_query <- snapcount::QueryBuilder(compilation = "gtex", regions = "SOD1")
# keeping only unannotated junctions
# from liver where SOD1 is highly expressed
# https://gtexportal.org/home/gene/SOD1
sod1_query <- set_row_filters(sod1_query, annotated == 0)
sod1_query <- set_column_filters(sod1_query, SMTS == "Liver")
sod1_junctions <- snapcount::query_jx(sod1_query)
# obtain mean counts
mean_counts <-
sod1_junctions %>%
SummarizedExperiment::assays() %>%
.[["counts"]] %>%
as.matrix() %>%
rowMeans()
sod1_junctions <- sod1_junctions %>%
SummarizedExperiment::rowRanges() %>%
as.data.frame() %>%
dplyr::as_tibble()
# minor QC and tidying of the junctions
sod1_junctions <-
sod1_junctions %>%
dplyr::mutate(mean_count = mean_counts) %>%
dplyr::filter(mean_count > 0.3) %>%
dplyr::select(
seqnames,
start,
end,
strand,
mean_count
)
# Save data ---------------------------------------------------------------
usethis::use_data(
sod1_junctions,
compress = "gzip",
overwrite = TRUE
)
================================================
FILE: data-raw/sod1_pknox1_annotation.R
================================================
# Load libraries ----------------------------------------------------------
library(tidyverse)
library(rtracklayer)
library(R.utils)
# Main --------------------------------------------------------------------
gtf_path <- file.path(tempdir(), "Homo_sapiens.GRCh38.105.chr.gtf.gz")
# download ens 105 gtf
download.file(
stringr::str_c(
"http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/",
"Homo_sapiens.GRCh38.105.chr.gtf.gz"
),
destfile = gtf_path
)
# unzip gtf
R.utils::gunzip(gtf_path)
gtf_path <- gtf_path %>%
stringr::str_remove("\\.gz$")
gtf <- rtracklayer::import(gtf_path)
# extract example gene transcripts
# convert to tibble()
sod1_annotation <-
gtf[!is.na(gtf$gene_name) & gtf$gene_name == "SOD1"] %>%
as.data.frame() %>%
dplyr::as_tibble() %>%
dplyr::select(
seqnames,
start,
end,
strand,
type,
gene_name,
transcript_name,
transcript_biotype
)
pknox1_annotation <-
gtf[!is.na(gtf$gene_name) & gtf$gene_name == "PKNOX1"] %>%
as.data.frame() %>%
dplyr::as_tibble() %>%
dplyr::select(
seqnames,
start,
end,
strand,
type,
gene_name,
transcript_name,
transcript_biotype
)
# Save data ---------------------------------------------------------------
usethis::use_data(
sod1_annotation,
compress = "gzip",
overwrite = TRUE
)
usethis::use_data(
pknox1_annotation,
compress = "gzip",
overwrite = TRUE
)
================================================
FILE: inst/CITATION
================================================
pkgVer <- function(pkg) {
if (!exists("meta") || is.null(meta)) meta <- packageDescription(pkg)
ver <- meta$Version
paste0('https://github.com/dzhang32/', pkg, ' - R package version ', ver)
}
c(
bibentry(bibtype="article",
title = "ggtranscript: an R package for the visualization and interpretation of transcript isoforms using ggplot2",
author = personList(
as.person("Emil K Gustavsson"),
as.person("David Zhang"),
as.person("Regina H Reynolds"),
as.person("Sonia Garcia-Ruiz"),
as.person("Mina Ryten")
),
year = 2022,
journal = "Bioinformatics",
doi = "https://doi.org/10.1093/bioinformatics/btac409",
url = "https://academic.oup.com/bioinformatics/article/38/15/3844/6617821"
)
)
================================================
FILE: man/add_exon_number.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/add_exon_number.R
\name{add_exon_number}
\alias{add_exon_number}
\title{Add exon number}
\usage{
add_exon_number(exons, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}
\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} equivalent to input \code{exons}, with the additional
column "exon_number".
}
\description{
\code{add_exon_number()} adds the exon number (the order the exons are transcribed
within each transcript) as a column in \code{exons}. This can be useful when
visualizing long, complex transcript structures, in order to keep track of
specific exons of interest.
}
\details{
To note, a "strand" column must be present within \code{exons}. The strand is used
to differentiate whether exon numbers should be calculated according to
ascending ("+") or descending ("-") genomic co-ordinates. For ambiguous
strands ("*"), \code{add_exon_number()} will be assume the strand be "+".
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()
# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()
# add the exon number for each transcript
sod1_exons <- sod1_exons \%>\% add_exon_number(group_var = "transcript_name")
base <- sod1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = to_intron(sod1_exons, "transcript_name"),
strand = "+"
)
# it can be useful to annotate exons with their exon number
# using ggplot2::geom_text()
base +
geom_text(aes(
x = (start + end) / 2, # plot label at midpoint of exon
label = exon_number
),
size = 3.5,
nudge_y = 0.4
)
# Or alternatively, using ggrepel::geom_label_repel()
# to separate labels from exons
base +
ggrepel::geom_label_repel(ggplot2::aes(
x = (start + end) / 2,
label = exon_number
),
size = 3.5,
min.segment.length = 0
)
}
================================================
FILE: man/add_utr.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/add_utr.R
\name{add_utr}
\alias{add_utr}
\title{Add untranslated regions (UTRs)}
\usage{
add_utr(exons, cds, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}
\item{cds}{\code{data.frame()} contains coding sequence ranges for the transcripts
in \code{exons}.}
\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} contains differentiated CDS and UTR ranges.
}
\description{
Given a set of \code{exons} (encompassing the CDS and UTRs) and \code{cds} regions,
\code{add_utr()} will calculate and add the corresponding UTR regions as ranges.
This can be useful when combined with \code{shorten_gaps()} to visualize
transcripts with long introns, whilst differentiating UTRs from CDS regions.
}
\details{
The definition of the inputted \code{cds} regions are expected to range from the
beginning of the start codon to the end of the stop codon. Sometimes, for
example in the case of Ensembl, reference annotation will omit the stop
codons from the CDS definition. In such cases, users should manually ensure
that the \code{cds} includes both the start and stop codons.
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation \%>\% head()
# extract exons
pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon")
pknox1_exons \%>\% head()
# extract cds
pknox1_cds <- pknox1_annotation \%>\% dplyr::filter(type == "CDS")
pknox1_cds \%>\% head()
# the CDS definition originating from the Ensembl reference annotation
# does not include the stop codon
# we must incorporate the stop codons into the CDS manually
# by adding 3 base pairs to the end of the CDS of each transcript
pknox1_cds_w_stop <- pknox1_cds \%>\%
dplyr::group_by(transcript_name) \%>\%
dplyr::mutate(
end = ifelse(end == max(end), end + 3, end)
) \%>\%
dplyr::ungroup()
# add_utr() adds ranges that represent the UTRs
pknox1_cds_utr <- add_utr(
pknox1_exons,
pknox1_cds_w_stop,
group_var = "transcript_name"
)
pknox1_cds_utr \%>\% head()
# this can be useful when combined with shorten_gaps()
# to visualize transcripts with long introns whilst differentiating UTRs
pknox1_cds_utr_rescaled <-
shorten_gaps(
exons = pknox1_cds_utr,
introns = to_intron(pknox1_cds_utr, "transcript_name"),
group_var = "transcript_name"
)
pknox1_cds_utr_rescaled \%>\%
dplyr::filter(type == "CDS") \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_range(
data = pknox1_cds_utr_rescaled \%>\% dplyr::filter(type == "UTR"),
height = 0.25,
fill = "white"
) +
geom_intron(
data = to_intron(
pknox1_cds_utr_rescaled \%>\% dplyr::filter(type != "intron"),
"transcript_name"
),
arrow.min.intron.length = 110
)
}
================================================
FILE: man/geom_intron.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_intron.R
\name{geom_intron}
\alias{geom_intron}
\title{Plot intron lines with strand arrows}
\usage{
geom_intron(
mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
...,
arrow = grid::arrow(ends = "last", length = grid::unit(0.1, "inches")),
arrow.fill = NULL,
lineend = "butt",
linejoin = "round",
na.rm = FALSE,
arrow.min.intron.length = 0,
show.legend = NA,
inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and
\code{inherit.aes = TRUE} (the default), it is combined with the default mapping
at the top level of the plot. You must supply \code{mapping} if there is no plot
mapping.}
\item{data}{The data to be displayed in this layer. There are three
options:
If \code{NULL}, the default, the data is inherited from the plot
data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}.
A \code{data.frame}, or other object, will override the plot
data. All objects will be fortified to produce a data frame. See
\code{\link[ggplot2:fortify]{fortify()}} for which variables will be created.
A \code{function} will be called with a single argument,
the plot data. The return value must be a \code{data.frame}, and
will be used as the layer data. A \code{function} can be created
from a \code{formula} (e.g. \code{~ head(.x, 10)}).}
\item{stat}{The statistical transformation to use on the data for this layer.
When using a \verb{geom_*()} function to construct a layer, the \code{stat}
argument can be used the override the default coupling between geoms and
stats. The \code{stat} argument accepts the following:
\itemize{
\item A \code{Stat} ggproto subclass, for example \code{StatCount}.
\item A string naming the stat. To give the stat as a string, strip the
function name of the \code{stat_} prefix. For example, to use \code{stat_count()},
give the stat as \code{"count"}.
\item For more information and other ways to specify the stat, see the
\link[ggplot2:layer_stats]{layer stat} documentation.
}}
\item{position}{A position adjustment to use on the data for this layer. This
can be used in various ways, including to prevent overplotting and
improving the display. The \code{position} argument accepts the following:
\itemize{
\item The result of calling a position function, such as \code{position_jitter()}.
This method allows for passing extra arguments to the position.
\item A string naming the position adjustment. To give the position as a
string, strip the function name of the \code{position_} prefix. For example,
to use \code{position_jitter()}, give the position as \code{"jitter"}.
\item For more information and other ways to specify the position, see the
\link[ggplot2:layer_positions]{layer position} documentation.
}}
\item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These
arguments broadly fall into one of 4 categories below. Notably, further
arguments to the \code{position} argument, or aesthetics that are required
can \emph{not} be passed through \code{...}. Unknown arguments that are not part
of the 4 categories below are ignored.
\itemize{
\item Static aesthetics that are not mapped to a scale, but are at a fixed
value and apply to the layer as a whole. For example, \code{colour = "red"}
or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics}
section that lists the available options. The 'required' aesthetics
cannot be passed on to the \code{params}. Please note that while passing
unmapped aesthetics as vectors is technically possible, the order and
required length is not guaranteed to be parallel to the input data.
\item When constructing a layer using
a \verb{stat_*()} function, the \code{...} argument can be used to pass on
parameters to the \code{geom} part of the layer. An example of this is
\code{stat_density(geom = "area", outline.type = "both")}. The geom's
documentation lists which parameters it can accept.
\item Inversely, when constructing a layer using a
\verb{geom_*()} function, the \code{...} argument can be used to pass on parameters
to the \code{stat} part of the layer. An example of this is
\code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation
lists which parameters it can accept.
\item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through
\code{...}. This can be one of the functions described as
\link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend.
}}
\item{arrow}{specification for arrow heads, as created by \code{\link[grid:arrow]{grid::arrow()}}.}
\item{arrow.fill}{fill colour to use for the arrow head (if closed). \code{NULL}
means use \code{colour} aesthetic.}
\item{lineend}{Line end style (round, butt, square).}
\item{linejoin}{Line join style (round, mitre, bevel).}
\item{na.rm}{If \code{FALSE}, the default, missing values are removed with
a warning. If \code{TRUE}, missing values are silently removed.}
\item{arrow.min.intron.length}{\code{integer()} the minimum required width of an
intron for a strand arrow to be drawn. This can be useful to remove strand
arrows on short introns that overlap adjacent exons.}
\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.
It can also be a named logical vector to finely select the aesthetics to
display.}
\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_intron()} draws horizontal lines with central arrows that are designed
to represent introns. In combination with \code{geom_range()}/\code{geom_half_range()},
these geoms form the core components for visualizing transcript structures.
}
\details{
\code{geom_intron()} requires the following \code{aes()}; \code{xstart}, \code{xend} and \code{y}
(e.g. transcript name). If users do not have intron co-ordinates, these can
be generated from the corresponding exons using \code{to_intron()}. The \code{strand}
option (one of "+" or "-") adjusts the arrow direction to match the direction
of transcription. The \code{arrow.min.intron.length} parameter can be useful to
remove strand arrows that overlap exons, which can be a problem if plotted
introns include those that are relatively short.
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation \%>\% head()
# extract exons
pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon")
pknox1_exons \%>\% head()
# to_intron() is a helper function included in ggtranscript
# which is useful for converting exon co-ordinates to introns
pknox1_introns <- pknox1_exons \%>\% to_intron(group_var = "transcript_name")
pknox1_introns \%>\% head()
base <- pknox1_introns \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
))
# by default, geom_intron() assumes introns originate from the "+" strand
base + geom_intron()
# however this can be modified using the strand option
base + geom_intron(strand = "-")
# strand can also be set as an aes()
base + geom_intron(aes(strand = strand))
# as a ggplot2 extension, ggtranscript geoms inherit the
# the functionality from the parameters and aesthetics in ggplot2
base + geom_intron(
aes(colour = transcript_name),
linewidth = 1
)
# together, geom_range() and geom_intron() are designed to visualize
# the core components of transcript annotation
pknox1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = pknox1_introns
)
# for short introns, sometimes strand arrows will overlap exons
# to avoid this, users can set the arrow.min.intron.length parameter
pknox1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = pknox1_introns,
arrow.min.intron.length = 3500
)
}
================================================
FILE: man/geom_junction.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_junction.R
\name{geom_junction}
\alias{geom_junction}
\title{Plot junction curves}
\usage{
geom_junction(
mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
junction.orientation = "alternating",
junction.y.max = 1,
angle = 90,
ncp = 15,
na.rm = FALSE,
orientation = NA,
show.legend = NA,
inherit.aes = TRUE,
...
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and
\code{inherit.aes = TRUE} (the default), it is combined with the default mapping
at the top level of the plot. You must supply \code{mapping} if there is no plot
mapping.}
\item{data}{The data to be displayed in this layer. There are three
options:
If \code{NULL}, the default, the data is inherited from the plot
data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}.
A \code{data.frame}, or other object, will override the plot
data. All objects will be fortified to produce a data frame. See
\code{\link[ggplot2:fortify]{fortify()}} for which variables will be created.
A \code{function} will be called with a single argument,
the plot data. The return value must be a \code{data.frame}, and
will be used as the layer data. A \code{function} can be created
from a \code{formula} (e.g. \code{~ head(.x, 10)}).}
\item{stat}{The statistical transformation to use on the data for this layer.
When using a \verb{geom_*()} function to construct a layer, the \code{stat}
argument can be used the override the default coupling between geoms and
stats. The \code{stat} argument accepts the following:
\itemize{
\item A \code{Stat} ggproto subclass, for example \code{StatCount}.
\item A string naming the stat. To give the stat as a string, strip the
function name of the \code{stat_} prefix. For example, to use \code{stat_count()},
give the stat as \code{"count"}.
\item For more information and other ways to specify the stat, see the
\link[ggplot2:layer_stats]{layer stat} documentation.
}}
\item{position}{A position adjustment to use on the data for this layer. This
can be used in various ways, including to prevent overplotting and
improving the display. The \code{position} argument accepts the following:
\itemize{
\item The result of calling a position function, such as \code{position_jitter()}.
This method allows for passing extra arguments to the position.
\item A string naming the position adjustment. To give the position as a
string, strip the function name of the \code{position_} prefix. For example,
to use \code{position_jitter()}, give the position as \code{"jitter"}.
\item For more information and other ways to specify the position, see the
\link[ggplot2:layer_positions]{layer position} documentation.
}}
\item{junction.orientation}{\code{character()} one of "alternating", "top" or
"bottom", specifying where the junctions will be plotted with respect to
each transcript (\code{y}).}
\item{junction.y.max}{\code{double()} the max y-value of each junction curve. It
can be useful to adjust this parameter when junction curves overlap with
one another/other transcripts or extend beyond the plot margins.}
\item{angle}{A numeric value between 0 and 180,
giving an amount to skew the control
points of the curve. Values less than 90 skew the curve towards
the start point and values greater than 90 skew the curve
towards the end point.}
\item{ncp}{The number of control points used to draw the curve.
More control points creates a smoother curve.}
\item{na.rm}{If \code{FALSE}, the default, missing values are removed with
a warning. If \code{TRUE}, missing values are silently removed.}
\item{orientation}{The orientation of the layer. The default (\code{NA})
automatically determines the orientation from the aesthetic mapping. In the
rare event that this fails it can be given explicitly by setting \code{orientation}
to either \code{"x"} or \code{"y"}. See the \emph{Orientation} section for more detail.}
\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.
It can also be a named logical vector to finely select the aesthetics to
display.}
\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.}
\item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These
arguments broadly fall into one of 4 categories below. Notably, further
arguments to the \code{position} argument, or aesthetics that are required
can \emph{not} be passed through \code{...}. Unknown arguments that are not part
of the 4 categories below are ignored.
\itemize{
\item Static aesthetics that are not mapped to a scale, but are at a fixed
value and apply to the layer as a whole. For example, \code{colour = "red"}
or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics}
section that lists the available options. The 'required' aesthetics
cannot be passed on to the \code{params}. Please note that while passing
unmapped aesthetics as vectors is technically possible, the order and
required length is not guaranteed to be parallel to the input data.
\item When constructing a layer using
a \verb{stat_*()} function, the \code{...} argument can be used to pass on
parameters to the \code{geom} part of the layer. An example of this is
\code{stat_density(geom = "area", outline.type = "both")}. The geom's
documentation lists which parameters it can accept.
\item Inversely, when constructing a layer using a
\verb{geom_*()} function, the \code{...} argument can be used to pass on parameters
to the \code{stat} part of the layer. An example of this is
\code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation
lists which parameters it can accept.
\item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through
\code{...}. This can be one of the functions described as
\link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend.
}}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_junction()} draws curves that are designed to represent junction reads
from RNA-sequencing data. It can be useful to overlay junction data on
transcript annotation (plotted using \code{geom_range()}/\code{geom_half_range()} and
\code{geom_intron()}) to understand which splicing events or transcripts have
support from RNA-sequencing data.
}
\details{
\code{geom_junction()} requires the following \code{aes()}; \code{xstart}, \code{xend} and \code{y}
(e.g. transcript name). \code{geom_junction()} curves can be modified using
\code{junction.y.max}, which can be useful when junctions overlap one
another/other transcripts or extend beyond the plot margins. By default,
junction curves will alternate between being plotted on the top and bottom of
each transcript (\code{y}), however this can be modified via
\code{junction.orientation}.
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()
# as well as a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions
# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(
type == "exon",
transcript_name == "SOD1-201"
)
sod1_exons \%>\% head()
# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions \%>\%
dplyr::mutate(transcript_name = "SOD1-201")
# junctions can be plotted as curves using geom_junction()
base <- sod1_junctions \%>\%
ggplot2::ggplot(ggplot2::aes(
xstart = start,
xend = end,
y = transcript_name
))
# sometimes, depending on the number and widths of transcripts and junctions
# junctions will go overlap one another or extend beyond the plot margin
base + geom_junction()
# in such cases, junction.y.max can be adjusted to modify the max y of curves
base + geom_junction(junction.y.max = 0.5)
# ncp can be used improve the smoothness of curves
base + geom_junction(junction.y.max = 0.5, ncp = 30)
# junction.orientation controls where the junction are plotted
# with respect to each transcript
# either alternating (default), or on the top or bottom
base + geom_junction(junction.orientation = "top", junction.y.max = 0.5)
base + geom_junction(junction.orientation = "bottom", junction.y.max = 0.5)
# it can be useful useful to overlay junction curves onto existing annotation
# plotted using geom_range() and geom_intron()
base <- sod1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = to_intron(sod1_exons, "transcript_name")
)
base + geom_junction(
data = sod1_junctions,
junction.y.max = 0.5
)
# as a ggplot2 extension, ggtranscript geoms inherit the
# the functionality from the parameters and aesthetics in ggplot2
# this can be useful when mapping junction thickness to their counts
base + geom_junction(
data = sod1_junctions,
aes(linewidth = mean_count),
junction.y.max = 0.5,
colour = "purple"
) +
scale_linewidth(range = c(0.1, 1))
# it can be useful to combine geom_junction() with geom_half_range()
sod1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_half_range() +
geom_intron(
data = to_intron(sod1_exons, "transcript_name")
) +
geom_junction(
data = sod1_junctions,
aes(linewidth = mean_count),
junction.y.max = 0.5,
junction.orientation = "top",
colour = "purple"
) +
scale_linewidth(range = c(0.1, 1))
}
================================================
FILE: man/geom_junction_label_repel.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_junction_label_repel.R
\name{geom_junction_label_repel}
\alias{geom_junction_label_repel}
\title{Label junction curves}
\usage{
geom_junction_label_repel(
mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
parse = FALSE,
...,
junction.orientation = "alternating",
junction.y.max = 1,
angle = 90,
ncp = 15,
box.padding = 0.25,
label.padding = 0.25,
point.padding = 1e-06,
label.r = 0.15,
label.size = 0.25,
min.segment.length = 0,
arrow = NULL,
force = 1,
force_pull = 1,
max.time = 0.5,
max.iter = 10000,
max.overlaps = getOption("ggrepel.max.overlaps", default = 10),
nudge_x = 0,
nudge_y = 0,
xlim = c(NA, NA),
ylim = c(NA, NA),
na.rm = FALSE,
show.legend = NA,
direction = c("both", "y", "x"),
seed = NA,
verbose = FALSE,
inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2]{aes}} or
\code{\link[ggplot2]{aes_}}. If specified and \code{inherit.aes = TRUE} (the
default), is combined with the default mapping at the top level of the
plot. You only need to supply \code{mapping} if there isn't a mapping
defined for the plot.}
\item{data}{A data frame. If specified, overrides the default data frame
defined at the top level of the plot.}
\item{stat}{The statistical transformation to use on the data for this
layer, as a string.}
\item{position}{Position adjustment, either as a string, or the result of
a call to a position adjustment function.}
\item{parse}{If TRUE, the labels will be parsed into expressions and
displayed as described in ?plotmath}
\item{...}{other arguments passed on to \code{\link[ggplot2]{layer}}. There are
three types of arguments you can use here:
\itemize{
\item Aesthetics: to set an aesthetic to a fixed value, like
\code{colour = "red"} or \code{size = 3}.
\item Other arguments to the layer, for example you override the
default \code{stat} associated with the layer.
\item Other arguments passed on to the stat.
}}
\item{junction.orientation}{\code{character()} one of "alternating", "top" or
"bottom", specifying where the junctions will be plotted with respect to
each transcript (\code{y}).}
\item{junction.y.max}{\code{double()} the max y-value of each junction curve. It
can be useful to adjust this parameter when junction curves overlap with
one another/other transcripts or extend beyond the plot margins.}
\item{angle}{A numeric value between 0 and 180,
giving an amount to skew the control
points of the curve. Values less than 90 skew the curve towards
the start point and values greater than 90 skew the curve
towards the end point.}
\item{ncp}{The number of control points used to draw the curve.
More control points creates a smoother curve.}
\item{box.padding}{Amount of padding around bounding box, as unit or number.
Defaults to 0.25. (Default unit is lines, but other units can be specified
by passing \code{unit(x, "units")}).}
\item{label.padding}{Amount of padding around label, as unit or number.
Defaults to 0.25. (Default unit is lines, but other units can be specified
by passing \code{unit(x, "units")}).}
\item{point.padding}{Amount of padding around labeled point, as unit or
number. Defaults to 0. (Default unit is lines, but other units can be
specified by passing \code{unit(x, "units")}).}
\item{label.r}{Radius of rounded corners, as unit or number. Defaults
to 0.15. (Default unit is lines, but other units can be specified by
passing \code{unit(x, "units")}).}
\item{label.size}{Size of label border, in mm.}
\item{min.segment.length}{Skip drawing segments shorter than this, as unit or
number. Defaults to 0.5. (Default unit is lines, but other units can be
specified by passing \code{unit(x, "units")}).}
\item{arrow}{specification for arrow heads, as created by \code{\link[grid]{arrow}}}
\item{force}{Force of repulsion between overlapping text labels. Defaults
to 1.}
\item{force_pull}{Force of attraction between a text label and its
corresponding data point. Defaults to 1.}
\item{max.time}{Maximum number of seconds to try to resolve overlaps.
Defaults to 0.5.}
\item{max.iter}{Maximum number of iterations to try to resolve overlaps.
Defaults to 10000.}
\item{max.overlaps}{Exclude text labels when they overlap too many other
things. For each text label, we count how many other text labels or other
data points it overlaps, and exclude the text label if it has too many overlaps.
Defaults to 10.}
\item{nudge_x, nudge_y}{Horizontal and vertical adjustments to nudge the
starting position of each text label. The units for \code{nudge_x} and
\code{nudge_y} are the same as for the data units on the x-axis and y-axis.}
\item{xlim, ylim}{Limits for the x and y axes. Text labels will be constrained
to these limits. By default, text labels are constrained to the entire plot
area.}
\item{na.rm}{If \code{FALSE} (the default), removes missing values with
a warning. If \code{TRUE} silently removes missing values.}
\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.}
\item{direction}{"both", "x", or "y" -- direction in which to adjust position of labels}
\item{seed}{Random seed passed to \code{\link[base]{set.seed}}. Defaults to
\code{NA}, which means that \code{set.seed} will not be called.}
\item{verbose}{If \code{TRUE}, some diagnostics of the repel algorithm are printed}
\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2]{borders}}.}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_junction_label_repel()} labels junction curves at their midpoint using
\code{ggrepel::geom_label_repel()}. This can be useful to label and compare
junctions (plotted using \code{geom_junction()}) with metrics of their usage (e.g.
read counts or percent-spliced-in).
}
\details{
\code{geom_junction_label_repel()} requires the following \code{aes()}; \code{xstart},
\code{xend}, \code{y} (e.g. transcript name) and \code{label}. Under the hood,
\code{geom_junction_label_repel()} generates the same junction curves as
\code{geom_junction()} to obtain curve midpoints for labeling. Therefore, it is
important that users use the same input data and parameters that alter
junction curves (namely \code{junction.orientation}, \code{junction.y.max}, \code{angle},
\code{ncp}) for \code{geom_junction_label_repel()} that they have used for
\code{geom_junction()}.
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()
# as well as a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions
# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(
type == "exon",
transcript_name == "SOD1-201"
)
sod1_exons \%>\% head()
# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions \%>\%
dplyr::mutate(transcript_name = "SOD1-201")
# geom_junction_label_repel() can be used to label junctions
base <- sod1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = to_intron(sod1_exons, "transcript_name")
)
# this can be useful to label junctions with their counts
base +
geom_junction(
data = sod1_junctions,
junction.y.max = 0.5
) +
geom_junction_label_repel(
data = sod1_junctions,
aes(label = round(mean_count, 2)),
junction.y.max = 0.5
)
}
================================================
FILE: man/geom_range.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_range.R, R/geom_half_range.R
\name{geom_range}
\alias{geom_range}
\alias{geom_half_range}
\title{Plot genomic ranges}
\usage{
geom_range(
mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
...,
vjust = NULL,
linejoin = "mitre",
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE
)
geom_half_range(
mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
...,
range.orientation = "bottom",
linejoin = "mitre",
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and
\code{inherit.aes = TRUE} (the default), it is combined with the default mapping
at the top level of the plot. You must supply \code{mapping} if there is no plot
mapping.}
\item{data}{The data to be displayed in this layer. There are three
options:
If \code{NULL}, the default, the data is inherited from the plot
data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}.
A \code{data.frame}, or other object, will override the plot
data. All objects will be fortified to produce a data frame. See
\code{\link[ggplot2:fortify]{fortify()}} for which variables will be created.
A \code{function} will be called with a single argument,
the plot data. The return value must be a \code{data.frame}, and
will be used as the layer data. A \code{function} can be created
from a \code{formula} (e.g. \code{~ head(.x, 10)}).}
\item{stat}{The statistical transformation to use on the data for this layer.
When using a \verb{geom_*()} function to construct a layer, the \code{stat}
argument can be used the override the default coupling between geoms and
stats. The \code{stat} argument accepts the following:
\itemize{
\item A \code{Stat} ggproto subclass, for example \code{StatCount}.
\item A string naming the stat. To give the stat as a string, strip the
function name of the \code{stat_} prefix. For example, to use \code{stat_count()},
give the stat as \code{"count"}.
\item For more information and other ways to specify the stat, see the
\link[ggplot2:layer_stats]{layer stat} documentation.
}}
\item{position}{A position adjustment to use on the data for this layer. This
can be used in various ways, including to prevent overplotting and
improving the display. The \code{position} argument accepts the following:
\itemize{
\item The result of calling a position function, such as \code{position_jitter()}.
This method allows for passing extra arguments to the position.
\item A string naming the position adjustment. To give the position as a
string, strip the function name of the \code{position_} prefix. For example,
to use \code{position_jitter()}, give the position as \code{"jitter"}.
\item For more information and other ways to specify the position, see the
\link[ggplot2:layer_positions]{layer position} documentation.
}}
\item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These
arguments broadly fall into one of 4 categories below. Notably, further
arguments to the \code{position} argument, or aesthetics that are required
can \emph{not} be passed through \code{...}. Unknown arguments that are not part
of the 4 categories below are ignored.
\itemize{
\item Static aesthetics that are not mapped to a scale, but are at a fixed
value and apply to the layer as a whole. For example, \code{colour = "red"}
or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics}
section that lists the available options. The 'required' aesthetics
cannot be passed on to the \code{params}. Please note that while passing
unmapped aesthetics as vectors is technically possible, the order and
required length is not guaranteed to be parallel to the input data.
\item When constructing a layer using
a \verb{stat_*()} function, the \code{...} argument can be used to pass on
parameters to the \code{geom} part of the layer. An example of this is
\code{stat_density(geom = "area", outline.type = "both")}. The geom's
documentation lists which parameters it can accept.
\item Inversely, when constructing a layer using a
\verb{geom_*()} function, the \code{...} argument can be used to pass on parameters
to the \code{stat} part of the layer. An example of this is
\code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation
lists which parameters it can accept.
\item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through
\code{...}. This can be one of the functions described as
\link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend.
}}
\item{vjust}{A numeric vector specifying vertical justification.
If specified, overrides the \code{just} setting.}
\item{linejoin}{Line join style (round, mitre, bevel).}
\item{na.rm}{If \code{FALSE}, the default, missing values are removed with
a warning. If \code{TRUE}, missing values are silently removed.}
\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.
It can also be a named logical vector to finely select the aesthetics to
display.}
\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.}
\item{range.orientation}{\code{character()} one of "top" or "bottom", specifying
where the half ranges will be plotted with respect to each transcript
(\code{y}).}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_range()} and \code{geom_half_range()} draw tiles that are designed to
represent range-based genomic features, such as exons. In combination with
\code{geom_intron()}, these geoms form the core components for visualizing
transcript structures.
}
\details{
\code{geom_range()} and \code{geom_half_range()} require the following \code{aes()};
\code{xstart}, \code{xend} and \code{y} (e.g. transcript name). \code{geom_half_range()} takes
advantage of the vertical symmetry of transcript annotation by plotting only
half of a range on the top or bottom of a transcript structure. This can be
useful for comparing between two transcripts or free up plotting space for
other transcript annotations (e.g. \code{geom_junction()}).
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()
# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()
base <- sod1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
))
# geom_range() is designed to visualise range-based annotation such as exons
base + geom_range()
# geom_half_range() allows users to plot half ranges
# on the top or bottom of the transcript
base + geom_half_range()
# where the half ranges are plotted can be adjusted using range.orientation
base + geom_half_range(range.orientation = "top")
# as a ggplot2 extension, ggtranscript geoms inherit the
# the functionality from the parameters and aesthetics in ggplot2
base + geom_range(
aes(fill = transcript_name),
linewidth = 1
)
# together, geom_range() and geom_intron() are designed to visualize
# the core components of transcript annotation
base + geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = to_intron(sod1_exons, "transcript_name")
)
# for protein coding transcripts
# geom_range() be useful for visualizing UTRs that lie outside of the CDS
sod1_exons_prot_coding <- sod1_exons \%>\%
dplyr::filter(transcript_biotype == "protein_coding")
# extract cds
sod1_cds <- sod1_annotation \%>\%
dplyr::filter(type == "CDS")
sod1_exons_prot_coding \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_cds
) +
geom_intron(
data = to_intron(sod1_exons_prot_coding, "transcript_name")
)
# geom_half_range() can be useful for comparing between two transcripts
# enabling visualization of one transcript on the top, other on the bottom
sod1_201_exons <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-201")
sod1_201_cds <- sod1_cds \%>\% dplyr::filter(transcript_name == "SOD1-201")
sod1_202_exons <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-202")
sod1_202_cds <- sod1_cds \%>\% dplyr::filter(transcript_name == "SOD1-202")
sod1_201_plot <- sod1_201_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = "SOD1-201/202"
)) +
geom_half_range(
fill = "white",
height = 0.125
) +
geom_half_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
)
sod1_201_plot
sod1_201_202_plot <- sod1_201_plot +
geom_half_range(
data = sod1_202_exons,
range.orientation = "top",
fill = "white",
height = 0.125
) +
geom_half_range(
data = sod1_202_cds,
range.orientation = "top",
fill = "purple"
) +
geom_intron(
data = to_intron(sod1_202_exons, "transcript_name")
)
sod1_201_202_plot
# leveraging existing ggplot2 functionality via e.g. coord_cartesian()
# can be useful to zoom in on areas of interest
sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))
}
================================================
FILE: man/ggtranscript.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ggtranscript-package.R
\docType{package}
\name{ggtranscript}
\alias{ggtranscript-package}
\alias{ggtranscript}
\title{\code{ggtranscript}: Visualizing transcript structure and annotation using
\code{ggplot2}}
\description{
The goal of \code{ggtranscript} is the simplify the process of visualizing
transcript structure and annotation. To achieve this, \code{ggtranscript}
introduces 5 new geoms (\code{geom_range()}, \code{geom_half_range()}, \code{geom_intron()},
\code{geom_junction()} and \code{geom_junction_label_repel()}) as well as several
helper functions. As a \code{ggplot2} extension, \code{ggtranscript} inherits
\code{ggplot2}'s familiarity and flexibility, enabling users to intuitively adjust
aesthetics, parameters, scales etc as well as complement \code{ggtranscript} geoms
with existing \code{ggplot2} geoms to create informative, publication-ready plots.
}
\seealso{
Useful links:
\itemize{
\item \url{https://github.com/dzhang32/ggtranscript}
\item Report bugs at \url{https://github.com/dzhang32/ggtranscript/issues}
}
}
\author{
\strong{Maintainer}: David Zhang \email{dyzhang32@gmail.com} (\href{https://orcid.org/0000-0003-2382-8460}{ORCID})
Authors:
\itemize{
\item Emil Gustavsson \email{e.gustavsson@ucl.ac.uk} (\href{https://orcid.org/0000-0003-0541-7537}{ORCID})
}
Other contributors:
\itemize{
\item Regina Reynolds \email{regina.reynolds.16@ucl.ac.uk} (\href{https://orcid.org/0000-0001-6470-7919}{ORCID}) [contributor]
\item Sonia Ruiz \email{s.ruiz@ucl.ac.uk} [contributor]
}
}
================================================
FILE: man/shorten_gaps.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/shorten_gaps.R
\name{shorten_gaps}
\alias{shorten_gaps}
\title{Improve transcript structure visualization by shortening gaps}
\usage{
shorten_gaps(exons, introns, group_var = NULL, target_gap_width = 100L)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}
\item{introns}{\code{data.frame()} the intron co-ordinates corresponding to the
input \code{exons}. This can be created by applying \code{to_intron()} to the
\code{exons}. If introns originate from multiple transcripts, they must be
differentiated using \code{group_var}. If a user is not using \code{to_intron()},
they must make sure intron start/ends are defined precisely as the adjacent
exon boundaries (rather than exon end + 1 and exon start - 1).}
\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
\item{target_gap_width}{\code{integer()} the width in base pairs to shorten the
gaps to.}
}
\value{
\code{data.frame()} contains the re-scaled co-ordinates of \code{introns} and
\code{exons} of each input transcript with shortened gaps.
}
\description{
For a given set of exons and introns, \code{shorten_gaps()} reduces the width of
gaps (regions that do not overlap any \code{exons}) to a user-inputted
\code{target_gap_width}. This can be useful when visualizing transcripts that have
long introns, to hone in on the regions of interest (i.e. exons) and better
compare between transcript structures.
}
\details{
After \code{shorten_gaps()} reduces the size of gaps, it will re-scale \code{exons} and
\code{introns} to preserve exon alignment. This process will only reduce the width
of input \code{introns}, never \code{exons}. Importantly, the outputted re-scaled
co-ordinates should only be used for visualization as they will not match the
original genomic coordinates.
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation \%>\% head()
# extract exons
pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon")
pknox1_exons \%>\% head()
# to_intron() is a helper function included in ggtranscript
# which is useful for converting exon co-ordinates to introns
pknox1_introns <- pknox1_exons \%>\% to_intron(group_var = "transcript_name")
pknox1_introns \%>\% head()
# for transcripts with long introns, the exons of interest
# can be difficult to visualize clearly when using the default scale
pknox1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = pknox1_introns,
arrow.min.intron.length = 3500
)
# in such cases it can be useful to rescale the exons and introns
# using shorten_gaps() which shortens regions that do not overlap an exon
pknox1_rescaled <-
shorten_gaps(pknox1_exons, pknox1_introns, group_var = "transcript_name")
pknox1_rescaled \%>\% head()
# this allows us to visualize differences in exonic structure more clearly
pknox1_rescaled \%>\%
dplyr::filter(type == "exon") \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = pknox1_rescaled \%>\% dplyr::filter(type == "intron"),
arrow.min.intron.length = 300
)
# shorten_gaps() can be used in combination with to_diff()
# to further highlight differences in exon structure
# here, all other transcripts are compared to the MANE-select transcript
pknox1_rescaled_diffs <- to_diff(
exons = pknox1_rescaled \%>\%
dplyr::filter(type == "exon", transcript_name != "PKNOX1-201"),
ref_exons = pknox1_rescaled \%>\%
dplyr::filter(type == "exon", transcript_name == "PKNOX1-201"),
group_var = "transcript_name"
)
pknox1_rescaled \%>\%
dplyr::filter(type == "exon") \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = pknox1_rescaled \%>\% dplyr::filter(type == "intron"),
arrow.min.intron.length = 300
) +
geom_range(
data = pknox1_rescaled_diffs,
aes(fill = diff_type),
alpha = 0.2
)
}
================================================
FILE: man/sod1_annotation.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{sod1_annotation}
\alias{sod1_annotation}
\alias{pknox1_annotation}
\title{Example transcript annotation}
\format{
A \code{tibble::tibble()}:
\describe{
\item{seqnames}{\code{factor()} chromosome.}
\item{start}{\code{integer()} start position.}
\item{end}{\code{integer()} end position.}
\item{strand}{\code{factor()} strand.}
\item{type}{\code{factor()} E.g.gene, transcript, exon or CDS.}
\item{gene_name}{\code{character()} name of gene (GBA).}
\item{transcript_name}{\code{character()} name of transcript.}
\item{transcript_biotype}{\code{character()} biotype of transcript.}
}
An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 112 rows and 8 columns.
}
\source{
generated using \code{ggtranscript/data-raw/sod1_pknox1_annotation.R}
}
\usage{
sod1_annotation
pknox1_annotation
}
\description{
Transcript annotation including the co-ordinates (hg38) of the genes,
transcripts, exons and CDS regions for \emph{SOD1} and \emph{PKNOX1}, which
originate from version 105 of the Ensembl reference annotation.
}
\keyword{datasets}
================================================
FILE: man/sod1_junctions.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{sod1_junctions}
\alias{sod1_junctions}
\title{Example junctions}
\format{
A \code{tibble::tibble()}:
\describe{
\item{seqnames}{\code{factor()} chromosome.}
\item{start}{\code{integer()} start position.}
\item{end}{\code{integer()} end position.}
\item{strand}{\code{factor()} strand.}
\item{mean_count}{\code{factor()} Average count across all GTEx liver samples.}
}
}
\source{
generated using \code{ggtranscript/data-raw/sod1_junctions.R}
}
\usage{
sod1_junctions
}
\description{
Junction co-ordinates and counts associated with the \emph{SOD1} gene.
Junctions counts originate from GTEx liver samples and are downloaded via the
Bioconductor package \code{snapcount}. Only unannotated junctions with a mean
count above 0.3 have been retained for this example.
}
\keyword{datasets}
================================================
FILE: man/to_diff.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/to_diff.R
\name{to_diff}
\alias{to_diff}
\title{Obtain the differences between transcript structure}
\usage{
to_diff(exons, ref_exons, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}
\item{ref_exons}{\code{data.frame()} contains exons that originate from a single
transcript, which \code{exons} will be compared against.}
\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} details the differences between \code{exons} and
\code{ref_exons}.
}
\description{
\code{to_diff()} obtains the difference between \code{exons} from a set of transcripts
to a reference transcript (\code{ref_exons}). This can be useful when visualizing
the differences between transcript structure. \code{to_diff()} expects two sets of
input exons; 1. \code{exons} - exons from any number of transcripts that will be
compared to \code{ref_exons} and 2. \code{ref_exons} - exons from a single transcript
which acts as the reference to compare against.
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()
# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()
# for this example, let's compare transcripts to the MANE-select transcript
sod1_mane <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-201")
sod1_not_mane <- sod1_exons \%>\% dplyr::filter(transcript_name != "SOD1-201")
# to_diff() obtains the differences between the exons as ranges
sod1_diffs <- to_diff(
exons = sod1_not_mane,
ref_exons = sod1_mane,
group_var = "transcript_name"
)
sod1_diffs \%>\% head()
# using geom_range(), it can be useful to visually overlay
# the differences on top of the transcript annotation
sod1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = to_intron(sod1_exons, "transcript_name")
) +
geom_range(
data = sod1_diffs,
ggplot2::aes(fill = diff_type),
alpha = 0.2
)
}
================================================
FILE: man/to_intron.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/to_intron.R
\name{to_intron}
\alias{to_intron}
\title{Convert exon co-ordinates to introns}
\usage{
to_intron(exons, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}
\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} contains the intron co-ordinates.
}
\description{
Given a set of \code{exons}, \code{to_intron()} will return the corresponding introns.
}
\details{
It is important to note that, for visualization purposes, \code{to_intron()}
defines introns precisely as the exon boundaries, rather than the intron
start/end being (exon end + 1)/(exon start - 1).
}
\examples{
library(magrittr)
library(ggplot2)
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()
# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()
# to_intron() is a helper function included in ggtranscript
# which is useful for converting exon co-ordinates to introns
sod1_introns <- sod1_exons \%>\% to_intron(group_var = "transcript_name")
sod1_introns \%>\% head()
# this can be particular useful when combined with
# geom_range() and geom_intron()
# to visualize the core components of transcript annotation
sod1_exons \%>\%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = to_intron(sod1_exons, "transcript_name")
)
}
================================================
FILE: tests/testthat/test-add_exon_number.R
================================================
sod1_exons <- sod1_annotation %>%
dplyr::filter(type == "exon")
# create dummy transcripts with both positive and minus strand
# purely for testing strand functionality
test_exons <- sod1_exons %>%
dplyr::filter(transcript_name == "SOD1-202") %>%
dplyr::mutate(strand = "-") %>%
dplyr::bind_rows(
sod1_exons %>%
dplyr::filter(transcript_name == "SOD1-201")
)
##### add_exon_number #####
testthat::test_that("add_exon_number() works correctly", {
test_exon_number <- test_exons %>%
add_exon_number(group_var = "transcript_name")
test_exon_number_plus <- test_exon_number %>%
dplyr::filter(strand == "+")
test_exon_number_minus <- test_exon_number %>%
dplyr::filter(strand == "-")
expect_true("exon_number" %in% colnames(test_exon_number))
expect_true(is.numeric(test_exon_number[["exon_number"]]))
expect_equal(
test_exon_number_plus[["exon_number"]],
seq_len(nrow(test_exon_number_plus))
)
expect_equal(
test_exon_number_minus[["exon_number"]],
seq_len(nrow(test_exon_number_minus)) %>% rev()
)
# check order makes no difference
set.seed(32)
expect_equal(
test_exons[sample(seq_len(nrow(test_exons)), nrow(test_exons)), ] %>%
add_exon_number(group_var = "transcript_name"),
test_exon_number
)
})
testthat::test_that("add_exon_number(group_var = NULL) works correctly", {
test_exon_number_plus <- test_exons %>%
dplyr::filter(strand == "+") %>%
add_exon_number(group_var = NULL)
test_exon_number_minus <- test_exons %>%
dplyr::filter(strand == "-") %>%
add_exon_number(group_var = NULL)
expect_equal(
test_exon_number_plus[["exon_number"]],
seq_len(nrow(test_exon_number_plus))
)
expect_equal(
test_exon_number_minus[["exon_number"]],
seq_len(nrow(test_exon_number_minus)) %>% rev()
)
})
================================================
FILE: tests/testthat/test-add_utr.R
================================================
pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
pknox1_cds <- pknox1_annotation %>% dplyr::filter(type == "CDS")
pknox1_utr <- pknox1_annotation %>% dplyr::filter(grepl("utr", type))
##### add_utr #####
# add 3 bp to end of cds as stop codon not included in ensembl cds
pknox1_cds_w_stop <- pknox1_cds %>%
dplyr::group_by(transcript_name) %>%
dplyr::mutate(
end = ifelse(end == max(end), end + 3, end)
) %>%
dplyr::ungroup()
pknox1_cds_utr <- add_utr(
pknox1_exons,
pknox1_cds_w_stop,
group_var = "transcript_name"
)
pknox1_cds_utr_1_tx <- add_utr(
pknox1_exons %>% dplyr::filter(transcript_name == "PKNOX1-203"),
pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203"),
group_var = "transcript_name"
)
pknox1_cds_utr_1_tx_no_group <- add_utr(
pknox1_exons %>% dplyr::filter(transcript_name == "PKNOX1-203"),
pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203"),
group_var = NULL
)
test_add_utrs <- function(cds_utr_add_utr, utr_annotation, cds_annotation) {
utr_add_utr <- cds_utr_add_utr %>%
dplyr::filter(type == "UTR") %>%
dplyr::select(start, end) %>%
dplyr::arrange(start, end)
cds_add_utr <- cds_utr_add_utr %>%
dplyr::filter(type == "CDS") %>%
dplyr::select(start, end) %>%
dplyr::arrange(start, end)
utr_annotation <- utr_annotation %>%
dplyr::select(start, end) %>%
dplyr::arrange(start, end)
cds_annotation <- cds_annotation %>%
dplyr::select(start, end) %>%
dplyr::arrange(start, end)
no_na_type <- all(!is.na(cds_utr_add_utr[["type"]]))
no_dummy_group <- is.null(cds_utr_add_utr[["dummy_group"]])
correct_utrs <- all.equal(utr_add_utr, utr_annotation)
correct_cds <- all.equal(cds_add_utr, cds_annotation)
check_add_utr <- all(no_na_type, no_dummy_group, correct_utrs, correct_cds)
return(check_add_utr)
}
testthat::test_that(
"add_utr() works correctly",
{
expect_true(test_add_utrs(pknox1_cds_utr, pknox1_utr, pknox1_cds_w_stop))
expect_true(test_add_utrs(
pknox1_cds_utr_1_tx,
pknox1_utr %>% dplyr::filter(transcript_name == "PKNOX1-203"),
pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203")
))
expect_true(test_add_utrs(
pknox1_cds_utr_1_tx_no_group,
pknox1_utr %>% dplyr::filter(transcript_name == "PKNOX1-203"),
pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203")
))
}
)
##### add_utr & shorten_gaps #####
pknox1_cds_utr_rescaled <-
shorten_gaps(
exons = pknox1_cds_utr,
introns = to_intron(pknox1_cds_utr, "transcript_name"),
group_var = "transcript_name"
)
# add labels helps manual checking
plot_before_after_rescaled <- function(cds_utr_before,
cds_utr_after,
group_var,
add_labels = FALSE) {
before_rescaling <- cds_utr_before %>%
dplyr::filter(type == "CDS") %>%
ggplot2::ggplot(ggplot2::aes(
xstart = start,
xend = end,
y = .data[[group_var]]
)) +
geom_range() +
geom_range(
data = cds_utr_before %>% dplyr::filter(type == "UTR"),
height = 0.25,
fill = "white"
) +
geom_intron(
data = to_intron(cds_utr_before, "transcript_name"),
)
after_rescaling <- cds_utr_after %>%
dplyr::filter(type == "CDS") %>%
ggplot2::ggplot(ggplot2::aes(
xstart = start,
xend = end,
y = .data[[group_var]]
)) +
geom_range() +
geom_range(
data = cds_utr_after %>% dplyr::filter(type == "UTR"),
height = 0.25,
fill = "white"
) +
geom_intron(
data = to_intron(
cds_utr_after %>%
dplyr::filter(type != "intron"),
"transcript_name"
),
)
before_after_list <- list(before_rescaling, after_rescaling)
if (add_labels) {
for (i in seq_len(length(before_after_list))) {
before_after_list[[i]] <- before_after_list[[i]] +
ggrepel::geom_label_repel(
ggplot2::aes_string(
x = "start",
y = group_var,
label = "start"
),
min.segment.length = 0
)
}
}
before_after_plot <- ggpubr::ggarrange(
plotlist = before_after_list, nrow = 2
)
return(before_after_plot)
}
testthat::test_that(
"shorten_gaps works correctly",
{
test_rescaled_w_utr_plot <- plot_before_after_rescaled(
pknox1_cds_utr,
pknox1_cds_utr_rescaled,
group_var = "transcript_name",
add_labels = FALSE
)
vdiffr::expect_doppelganger(
"test rescaled with utr plot",
test_rescaled_w_utr_plot
)
}
)
================================================
FILE: tests/testthat/test-geom_half_range.R
================================================
# create dummy exons for testing
test_exons <-
dplyr::tibble(
start = c(100, 300, 500, 650),
end = start + 100,
strand = c("+", "+", "-", "-"),
tx = c("A", "A", "B", "B")
)
# create base plot to be used in downstream tests
test_exons_plot <- test_exons %>%
ggplot2::ggplot(aes(
xstart = start,
xend = end,
y = tx
))
##### geom_half_range #####
testthat::test_that(
"geom_half_range() works correctly",
{
base_geom_half_range <- test_exons_plot +
geom_half_range()
w_param_geom_half_range <- test_exons_plot +
geom_half_range(colour = "red", fill = "blue")
w_aes_geom_half_range <- test_exons_plot +
geom_half_range(aes(fill = tx))
w_facet_geom_half_range <- test_exons_plot +
geom_half_range() +
ggplot2::facet_wrap(~tx)
vdiffr::expect_doppelganger(
"Base geom_half_range plot",
base_geom_half_range
)
vdiffr::expect_doppelganger(
"With param geom_half_range plot",
w_param_geom_half_range
)
vdiffr::expect_doppelganger(
"With aes geom_half_range plot",
w_aes_geom_half_range
)
vdiffr::expect_doppelganger(
"With facet geom_half_range plot",
w_facet_geom_half_range
)
}
)
testthat::test_that(
"geom_half_range(range.orientation = x) works correctly",
{
w_top_geom_half_range <- test_exons_plot +
geom_half_range(range.orientation = "top")
w_both_geom_half_range <- test_exons_plot +
geom_half_range(range.orientation = "top", fill = "red") +
geom_half_range(range.orientation = "bottom", fill = "blue")
vdiffr::expect_doppelganger(
"With top geom_half_range plot",
w_top_geom_half_range
)
vdiffr::expect_doppelganger(
"With both geom_half_range plot",
w_both_geom_half_range
)
}
)
testthat::test_that(
"geom_half_range() catches user input errors",
{
a_range.orientation <- test_exons_plot +
geom_half_range(range.orientation = "a")
expect_error(
print(a_range.orientation),
"range.orientation must be one of"
)
}
)
================================================
FILE: tests/testthat/test-geom_intron.R
================================================
test_introns <-
dplyr::tibble(
strand = c("+", "-"),
tx = c("A", "B"),
start = c(201, 601),
end = c(299, 649),
type = "intron"
)
# create base plot to be used in downstream tests
test_introns_plot <- test_introns %>%
ggplot2::ggplot(aes(
xstart = start,
xend = end,
y = tx
))
##### geom_intron #####
testthat::test_that(
"geom_intron() works correctly",
{
base_geom_intron <- test_introns_plot +
geom_intron()
w_param_geom_intron <- test_introns_plot +
geom_intron(colour = "blue", linewidth = 2)
w_aes_geom_intron <- test_introns_plot +
geom_intron(aes(colour = tx, linewidth = c(1L, 2L)))
w_facet_geom_intron <- test_introns_plot +
geom_intron() +
ggplot2::facet_wrap(~tx)
vdiffr::expect_doppelganger(
"Base geom_intron plot",
base_geom_intron
)
vdiffr::expect_doppelganger(
"With param geom_intron plot",
w_param_geom_intron
)
vdiffr::expect_doppelganger(
"With aes geom_intron plot",
w_aes_geom_intron
)
vdiffr::expect_doppelganger(
"With facet geom_intron plot",
w_facet_geom_intron
)
}
)
testthat::test_that(
"geom_intron(strand = x) works correctly",
{
minus_strand <- test_introns_plot +
geom_intron(strand = "-")
factor_strand <- test_introns_plot +
geom_intron(strand = factor("-"))
as_aes_strand <- test_introns_plot +
geom_intron(aes(strand = strand))
vdiffr::expect_doppelganger(
"Minus strand plot",
minus_strand
)
vdiffr::expect_doppelganger(
"factor strand plot",
factor_strand
)
vdiffr::expect_doppelganger(
"As aes strand plot",
as_aes_strand
)
}
)
testthat::test_that(
"geom_intron(arrow.min.intron.length = x) works correctly",
{
base_arrow.min <- test_introns_plot +
geom_intron(arrow.min.intron.length = 50)
w_strand_arrow_min <- test_introns_plot +
geom_intron(arrow.min.intron.length = 50, strand = "-")
vdiffr::expect_doppelganger(
"base arrow.min plot",
base_arrow.min
)
vdiffr::expect_doppelganger(
"with strand arrow.min plot",
w_strand_arrow_min
)
}
)
testthat::test_that(
"geom_intron() catches strand input errors",
{
na_strand <- test_introns_plot +
geom_intron(strand = c(NA, rep("+", nrow(test_introns) - 1)))
a_strand <- test_introns_plot +
geom_intron(strand = "a")
int_strand <- test_introns_plot +
geom_intron(aes(strand = start))
# seems to require print to catch error
expect_error(
print(na_strand),
"strand values must be one of"
)
expect_error(
print(a_strand),
"strand values must be one of"
)
expect_error(
print(int_strand),
"strand values must be one of"
)
}
)
testthat::test_that(
"geom_intron() catches arrow.min.intron.length input errors",
{
neg_arrow.min <- test_introns_plot +
geom_intron(arrow.min.intron.length = -1)
chr_arrow.min <- test_introns_plot +
geom_intron(arrow.min.intron.length = "1")
# seems to require print to catch error
expect_error(
print(neg_arrow.min),
"arrow.min.intron.length must be "
)
expect_error(
print(chr_arrow.min),
"arrow.min.intron.length must be "
)
}
)
================================================
FILE: tests/testthat/test-geom_junction.R
================================================
# manually create the expected introns
test_introns <-
sod1_annotation %>%
dplyr::filter(type == "exon") %>%
to_intron(group_var = "transcript_name")
# create base plot to be used in downstream tests
test_introns_plot <- test_introns %>%
ggplot2::ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
))
##### geom_junction #####
testthat::test_that(
"geom_junction() works correctly",
{
base_geom_junction <- test_introns_plot +
geom_junction()
w_param_geom_junction <- test_introns_plot +
geom_junction(colour = "red")
w_aes_geom_junction <- test_introns_plot +
geom_junction(aes(colour = transcript_name))
w_facet_geom_junction <- test_introns_plot +
geom_junction() +
ggplot2::facet_wrap(~transcript_biotype)
vdiffr::expect_doppelganger(
"Base geom_junction plot",
base_geom_junction
)
vdiffr::expect_doppelganger(
"With param geom_junction plot",
w_param_geom_junction
)
vdiffr::expect_doppelganger(
"With aes geom_junction plot",
w_aes_geom_junction
)
vdiffr::expect_doppelganger(
"With facet geom_junction plot",
w_facet_geom_junction
)
}
)
testthat::test_that(
"geom_junction(junction.orientation = x) works correctly",
{
top_junction.orientation <- test_introns_plot +
geom_junction(junction.orientation = "top")
bottom_junction.orientation <- test_introns_plot +
geom_junction(junction.orientation = "bottom")
w_aes_param_top_junction.orientation <- test_introns_plot +
geom_junction(
aes(colour = transcript_name),
linewidth = 1,
junction.orientation = "top"
)
vdiffr::expect_doppelganger(
"top junction.orientation plot",
top_junction.orientation
)
vdiffr::expect_doppelganger(
"bottom junction.orientation plot",
bottom_junction.orientation
)
vdiffr::expect_doppelganger(
"with aes and param top junction.orientation plot",
w_aes_param_top_junction.orientation
)
}
)
testthat::test_that(
"geom_junction(junction.y.max = x) works correctly",
{
junction.y.max_0.5 <- test_introns_plot +
geom_junction(junction.y.max = 0.5)
w_aes_param_junction.y.max_0.5 <- test_introns_plot +
geom_junction(
aes(colour = transcript_name),
linewidth = 1,
junction.y.max = 0.5
)
w_facet_junction.y.max_0.5 <- test_introns_plot +
geom_junction(junction.y.max = 0.5) +
ggplot2::facet_wrap(~transcript_biotype)
vdiffr::expect_doppelganger(
"0.5 junction.y.max plot",
junction.y.max_0.5
)
vdiffr::expect_doppelganger(
"with aes and param 0.5 junction.y.max plot",
w_aes_param_junction.y.max_0.5
)
vdiffr::expect_doppelganger(
"with facet 0.5 junction.y.max plot",
w_facet_junction.y.max_0.5
)
}
)
testthat::test_that(
"geom_junction() catches junction.orientation input errors",
{
a_junction.orientation <- test_introns_plot +
geom_junction(junction.orientation = "a")
expect_error(
print(a_junction.orientation),
"junction.orientation must be one of"
)
}
)
testthat::test_that(
"geom_junction() catches junction.y.max input errors",
{
len_2_junction.y.max <- test_introns_plot +
geom_junction(junction.y.max = c(1, 2))
a_junction.y.max <- test_introns_plot +
geom_junction(junction.y.max = "a")
expect_error(
print(len_2_junction.y.max),
"junction.y.max must have a length of 1"
)
expect_error(
print(a_junction.y.max),
"junction.y.max must be a numeric value"
)
}
)
================================================
FILE: tests/testthat/test-geom_junction_label_repel.R
================================================
# manually create the expected introns
test_introns <-
sod1_annotation %>%
dplyr::filter(
type == "exon",
transcript_name %in% c("SOD1-201", "SOD1-202")
) %>%
to_intron(group_var = "transcript_name") %>%
dplyr::mutate(
count = dplyr::row_number()
)
# create base plot to be used in downstream tests
test_introns_plot <- test_introns %>%
ggplot2::ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
))
##### geom_junction_label_repel #####
testthat::test_that(
"geom_junction() works correctly",
{
base_geom_junction_labels <- test_introns_plot +
geom_junction() +
geom_junction_label_repel(
aes(label = count),
seed = 32
)
w_param_geom_junction_labels <- test_introns_plot +
geom_junction(
junction.y.max = 0.5
) +
geom_junction_label_repel(
aes(label = count),
junction.y.max = 0.5,
seed = 32
)
w_aes_geom_junction_labels <- test_introns_plot +
geom_junction(aes(colour = transcript_name)) +
geom_junction_label_repel(
aes(
label = count,
colour = transcript_name
),
seed = 32
)
w_facet_geom_junction_labels <- test_introns_plot +
geom_junction() +
geom_junction_label_repel(
aes(label = count),
seed = 32
) +
ggplot2::facet_wrap(transcript_name ~ ., drop = TRUE)
vdiffr::expect_doppelganger(
"Base geom_junction_label_repel plot",
base_geom_junction_labels
)
vdiffr::expect_doppelganger(
"With param geom_junction_label_repel plot",
w_param_geom_junction_labels
)
vdiffr::expect_doppelganger(
"With aes geom_junction_label_repel plot",
w_aes_geom_junction_labels
)
vdiffr::expect_doppelganger(
"With facet geom_junction_label_repel plot",
w_facet_geom_junction_labels
)
}
)
================================================
FILE: tests/testthat/test-geom_range.R
================================================
# create dummy exons for testing
test_exons <-
dplyr::tibble(
start = c(100, 300, 500, 650),
end = start + 100,
strand = c("+", "+", "-", "-"),
tx = c("A", "A", "B", "B")
)
# create base plot to be used in downstream tests
test_exons_plot <- test_exons %>%
ggplot2::ggplot(aes(
xstart = start,
xend = end,
y = tx
))
##### geom_range #####
testthat::test_that(
"geom_range() works correctly",
{
base_geom_range <- test_exons_plot +
geom_range()
w_param_geom_range <- test_exons_plot +
geom_range(colour = "red", fill = "blue")
w_aes_geom_range <- test_exons_plot +
geom_range(aes(fill = tx))
w_facet_geom_range <- test_exons_plot +
geom_range() +
ggplot2::facet_wrap(~tx)
vdiffr::expect_doppelganger(
"Base geom_range plot",
geom_range
)
vdiffr::expect_doppelganger(
"With param geom_range plot",
w_param_geom_range
)
vdiffr::expect_doppelganger(
"With aes geom_range plot",
w_aes_geom_range
)
vdiffr::expect_doppelganger(
"With facet geom_range plot",
w_facet_geom_range
)
}
)
testthat::test_that(
"geom_range(vjust = x) works correctly",
{
w_vjust_geom_range <- test_exons_plot +
geom_range(vjust = 1.5, height = 0.25)
vdiffr::expect_doppelganger(
"With vjust geom_range plot",
w_vjust_geom_range
)
}
)
================================================
FILE: tests/testthat/test-shorten_gaps.R
================================================
test_exons <-
dplyr::tibble(
seqnames = "1",
start = c(100, 300, 500, 650),
end = start + 100,
strand = "+",
tx = c("A", "A", "B", "B")
)
pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
pknox1_introns <- pknox1_exons %>%
to_intron("transcript_name")
##### .get_gaps #####
# need to create gaps globally for downstream tests
pknox1_intron_gaps <- .get_gaps(GenomicRanges::GRanges(pknox1_exons))
test_intron_gaps <- .get_gaps(GenomicRanges::GRanges(test_exons))
test_.get_gaps <- function(exons, intron_gaps) {
# intron_gaps should not overlap any exons
exons_gap_hits <- GenomicRanges::findOverlaps(
GenomicRanges::GRanges(exons),
intron_gaps
)
overlap_exons <- length(exons_gap_hits) == 0
return(overlap_exons)
}
testthat::test_that(".get_gaps() works correctly", {
expect_true(test_.get_gaps(
pknox1_exons, pknox1_intron_gaps
))
expect_true(test_.get_gaps(
test_exons, test_intron_gaps
))
})
##### .get_tx_start_gaps #####
pknox1_tx_start_gaps <-
.get_tx_start_gaps(pknox1_exons, "transcript_name")
test_exons_tx_start_gaps <-
.get_tx_start_gaps(test_exons, NULL)
test_.get_tx_start_gaps <- function(exons, tx_start_gaps, group_var) {
unique_start <- length(unique(tx_start_gaps[["start"]])) == 1
correct_start <- all(tx_start_gaps[["start"]] == min(exons[["start"]]))
correct_end <- exons %>%
dplyr::group_by_at(.vars = group_var) %>%
dplyr::summarise(tx_start = min(start))
correct_end <- all(tx_start_gaps[["end"]] == correct_end[["tx_start"]])
correct_all <- all(
unique_start, correct_start, correct_end
)
return(correct_all)
}
testthat::test_that(".get_tx_start_gaps() works correctly", {
expect_true(test_.get_tx_start_gaps(
pknox1_exons,
pknox1_tx_start_gaps,
"transcript_name"
))
expect_true(test_.get_tx_start_gaps(
test_exons,
test_exons_tx_start_gaps,
NULL
))
})
##### .check_len_1_strand_seqnames #####
testthat::test_that(
".check_len_1_strand_seqnames() catches user input errors",
{
expect_error(
.check_len_1_strand_seqnames(1:2, 1),
"seqnames of object contains more than 1 unique value"
)
expect_error(
.check_len_1_strand_seqnames(1, 1:2),
"strand of object contains more than 1 unique value"
)
}
)
##### .check_type #####
testthat::test_that(".get_type() works correctly", {
added_type_exons <- pknox1_exons %>%
dplyr::select(-type) %>%
.get_type("exons") %>%
.[["type"]]
added_type_introns <- pknox1_exons %>%
dplyr::select(-type) %>%
.get_type("introns") %>%
.[["type"]]
expect_true(
all(added_type_exons == "exon")
)
expect_true(
all(added_type_introns == "intron")
)
expect_identical(
.get_type(pknox1_exons, "exons"),
pknox1_exons
)
expect_identical(
.get_type(pknox1_introns, "introns"),
pknox1_introns
)
})
testthat::test_that(".get_type() catches user input errors", {
expect_error(
.get_type(pknox1_exons, "introns"),
"values in the 'type' column of introns must be one of:"
)
})
##### .check_target_gap_width #####
testthat::test_that(".check_target_gap_width() catches user input errors", {
expect_warning(
.check_target_gap_width(100),
"target_gap_width must be an integer, coercing"
)
})
##### shorten_gaps #####
# also using this to test drop_orig_coords
pknox1_rescaled_tx <- shorten_gaps(
pknox1_exons,
pknox1_introns,
group_var = "transcript_name",
target_gap_width = 100L
)
pknox1_exons_1_tx <- pknox1_exons %>%
dplyr::filter(transcript_name == "PKNOX1-202")
pknox1_introns_1_tx <- pknox1_introns %>%
dplyr::filter(transcript_name == "PKNOX1-202")
pknox1_rescaled_1_tx <- shorten_gaps(
pknox1_exons_1_tx,
pknox1_introns_1_tx,
group_var = "transcript_name",
target_gap_width = 100L
)
pknox1_rescaled_1_tx_no_group <- shorten_gaps(
pknox1_exons_1_tx,
pknox1_introns_1_tx,
group_var = NULL,
target_gap_width = 100L
)
test_rescaled_tx <- shorten_gaps(
test_exons,
to_intron(test_exons, "tx"),
group_var = "tx",
target_gap_width = 50L
)
testthat::test_that("shorten_gaps() keeps existing columns", {
expect_true(!is.null(pknox1_rescaled_tx[["transcript_biotype"]]))
expect_true(!is.null(
pknox1_rescaled_1_tx_no_group[["transcript_biotype"]]
))
})
testthat::test_that("shorten_gaps() takes user inputted type", {
# for test, we modify all exons types to "utr"
all_utr <- shorten_gaps(
pknox1_exons %>% dplyr::mutate(type = "utr"),
pknox1_introns,
group_var = "transcript_name",
target_gap_width = 100L
)
expect_true(all(pknox1_rescaled_tx[["type"]] %in% c("exon", "intron")))
expect_true(all(all_utr[["type"]] %in% c("utr", "intron")))
})
test_shorten_gaps <- function(exons, rescaled_tx) {
# should never shorten exons
exon_widths_before <- exons[["end"]] - exons[["start"]]
exon_widths_after <- rescaled_tx %>%
dplyr::filter(type == "exon") %>%
dplyr::mutate(width = end - start) %>%
.[["width"]]
unchanged_exon_widths <- all.equal(
sort(exon_widths_before), sort(exon_widths_after)
)
return(unchanged_exon_widths)
}
testthat::test_that("shorten_gaps() never modifies exons", {
expect_true(test_shorten_gaps(
pknox1_exons,
pknox1_rescaled_tx
))
expect_true(test_shorten_gaps(
pknox1_exons_1_tx,
pknox1_rescaled_1_tx
))
expect_true(test_shorten_gaps(
pknox1_exons_1_tx,
pknox1_rescaled_1_tx_no_group
))
expect_true(test_shorten_gaps(
test_exons,
test_rescaled_tx
))
})
# add labels helps manual checking
plot_rescaled_tx <- function(exons,
rescaled_tx,
group_var,
add_labels = FALSE) {
before_rescaling <- exons %>%
ggplot2::ggplot(ggplot2::aes(
xstart = start,
xend = end,
y = .data[[group_var]]
)) +
geom_range() +
geom_intron(
data = to_intron(exons, group_var),
strand = "-",
arrow.min.intron.length = 500
)
after_rescaling <- rescaled_tx %>%
dplyr::filter(type == "exon") %>%
ggplot2::ggplot(ggplot2::aes(
xstart = start,
xend = end,
y = .data[[group_var]]
)) +
geom_range() +
geom_intron(
data = rescaled_tx %>%
dplyr::filter(type == "intron"),
strand = "-",
arrow.min.intron.length = 500
)
before_after_list <- list(before_rescaling, after_rescaling)
if (add_labels) {
for (i in seq_len(length(before_after_list))) {
before_after_list[[i]] <- before_after_list[[i]] +
ggrepel::geom_label_repel(
ggplot2::aes(
x = end,
y = .data[[group_var]],
label = end
),
linewidth = 2,
min.segment.length = 0
)
}
}
before_after_plot <- ggpubr::ggarrange(
plotlist = before_after_list, nrow = 2
)
return(before_after_plot)
}
testthat::test_that(
"shorten_gaps works correctly",
{
test_rescaled_plot <- plot_rescaled_tx(
test_exons, test_rescaled_tx, "tx"
)
pknox1_rescaled_plot <- plot_rescaled_tx(
pknox1_exons, pknox1_rescaled_tx, "transcript_name"
)
pknox1_rescaled_plot_1_tx <- plot_rescaled_tx(
pknox1_exons_1_tx, pknox1_rescaled_1_tx, "transcript_name"
)
# make sure everything works okay even if group is set to NULL
pknox1_plot_1_tx_no_group <- plot_rescaled_tx(
pknox1_exons_1_tx,
pknox1_rescaled_1_tx_no_group,
"transcript_name"
)
vdiffr::expect_doppelganger(
"test exons rescaled plot",
test_rescaled_plot
)
vdiffr::expect_doppelganger(
"pknox1 rescaled plot",
pknox1_rescaled_plot
)
vdiffr::expect_doppelganger(
"pknox1 rescaled plot 1 tx",
pknox1_rescaled_plot_1_tx
)
vdiffr::expect_doppelganger(
"pknox1 rescaled plot 1 tx no group",
pknox1_plot_1_tx_no_group
)
}
)
================================================
FILE: tests/testthat/test-to_diff.R
================================================
sod1_exons <- sod1_annotation %>%
dplyr::filter(type == "exon")
mane <- sod1_exons %>%
dplyr::filter(transcript_name == "SOD1-201")
single_tx <- sod1_exons %>%
dplyr::filter(transcript_name %in% c("SOD1-202"))
multi_tx <- sod1_exons %>%
dplyr::filter(transcript_name %in% c("SOD1-202", "SOD1-203", "SOD1-204"))
##### to_diff #####
testthat::test_that("to_diff() works correctly", {
test_diffs <- to_diff(
exons = single_tx,
ref_exons = mane
)
expect_true(is.data.frame(test_diffs))
expect_true(nrow(test_diffs) > 0)
expect_true(all(
c("seqnames", "start", "end", "strand", "type", "diff_type") %in%
colnames(test_diffs)
))
})
testthat::test_that("to_diff() works correctly for single transcripts", {
test_diffs <- to_diff(
exons = single_tx,
ref_exons = mane,
group_var = "transcript_name"
)
# think the easiest way to check diffs is via plotting
single_tx_diff_plot <- mane %>%
dplyr::bind_rows(single_tx) %>%
ggplot2::ggplot(
aes(
xstart = start,
xend = end,
y = transcript_name
)
) +
geom_range() +
geom_range(
data = test_diffs,
alpha = 0.2,
fill = "red"
)
vdiffr::expect_doppelganger(
"single tx diff plot",
single_tx_diff_plot
)
})
testthat::test_that("to_diff() works correctly for multiple transcripts", {
test_diffs <- to_diff(
exons = multi_tx,
ref_exons = mane,
group_var = "transcript_name"
)
multi_tx_diff_plot <- mane %>%
dplyr::bind_rows(multi_tx) %>%
ggplot2::ggplot(
aes(
xstart = start,
xend = end,
y = transcript_name
)
) +
geom_range() +
geom_range(
data = test_diffs,
aes(fill = diff_type),
alpha = 0.2,
)
vdiffr::expect_doppelganger(
"multi tx diff plot",
multi_tx_diff_plot
)
})
================================================
FILE: tests/testthat/test-to_intron.R
================================================
# create dummy exons for testing
test_exons <-
dplyr::tibble(
start = c(100, 300, 500, 650),
end = start + 100,
strand = c("+", "+", "-", "-"),
tx = c("A", "A", "B", "B")
)
# manually create the expected introns
test_introns <-
dplyr::tibble(
strand = c("+", "-"),
tx = c("A", "B"),
start = c(200, 600),
end = c(300, 650),
type = "intron"
)
pknox1_cds_utr <-
pknox1_annotation %>% dplyr::filter(
type == "CDS" | grepl("utr", type)
)
##### to_intron #####
testthat::test_that("to_intron() obtains introns correctly", {
# with group_var
expect_equal(
test_introns,
test_exons %>% to_intron(group_var = "tx")
)
# without group_var
expect_equal(
test_introns %>% dplyr::filter(tx != "B"),
test_exons %>%
dplyr::filter(tx != "B") %>%
to_intron()
)
})
testthat::test_that(
"to_intron() obtains introns correctly, regardless of exon order",
{
set.seed(32)
expect_equal(
test_introns,
test_exons %>%
.[sample(seq_len(nrow(test_exons))), ] %>%
to_intron(group_var = "tx")
)
}
)
================================================
FILE: tests/testthat/test-utils.R
================================================
# create dummy exons for testing
test_exons <-
dplyr::tibble(
start = c(100, 300, 500, 650),
end = start + 100,
strand = c("+", "+", "-", "-"),
tx = c("A", "A", "B", "B")
)
##### .check_coord_object #####
testthat::test_that(".check_coord_object() works correctly", {
expect_equal(
.check_coord_object(test_exons),
NULL
)
})
testthat::test_that(".check_coord_object() catches user input errors", {
expect_error(
.check_coord_object("1"),
"must be a data.frame"
)
expect_error(
.check_coord_object(test_exons %>% dplyr::select(-start)),
"must have the columns"
)
expect_error(
.check_coord_object(test_exons %>% dplyr::select(-end)),
"must have the columns"
)
expect_error(
.check_coord_object(test_exons, check_seqnames = TRUE),
"must have the column"
)
expect_error(
.check_coord_object(
test_exons %>% dplyr::select(-strand),
check_strand = TRUE
),
"must have the column"
)
})
##### .check_group_var #####
testthat::test_that(".check_group_var() works correctly", {
expect_equal(
.check_group_var(test_exons, group_var = NULL),
NULL
)
expect_equal(
.check_group_var(test_exons, group_var = "tx"),
NULL
)
})
testthat::test_that(".check_group_var() catches user input errors", {
expect_error(
.check_group_var(test_exons, "not_a_col"),
"must be a column in"
)
})
================================================
FILE: tests/testthat.R
================================================
library(testthat)
library(ggtranscript)
test_check("ggtranscript")
================================================
FILE: vignettes/.gitignore
================================================
*.html
*.R
================================================
FILE: vignettes/ggtranscript.Rmd
================================================
---
title: "Getting started"
author:
- name: David Zhang
affiliation:
- UCL
email: dyzhang32@gmail.com
output:
BiocStyle::html_document:
self_contained: yes
toc: true
toc_float: true
toc_depth: 2
code_folding: show
package: "`r pkg_ver('ggtranscript')`"
vignette: >
%\VignetteIndexEntry{Introduction to ggtranscript}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
crop = NULL
)
```
```{r load-ggtranscript}
library(magrittr)
library(dplyr)
library(ggtranscript)
library(ggplot2)
library(rtracklayer)
```
`ggtranscript` is designed to make it easy to visualize transcript structure and annotation using `ggplot2`.
As the intended users are those who work with genetic and/or transcriptomic data in `R`, this tutorial assumes a basic understanding of transcript annotation and familiarity with `ggplot2`.
## Input data
### Example data
In order to showcase the package's functionality, `ggtranscript` includes example transcript annotation for the genes *SOD1* and *PKNOX1*, as well as a set of unannotated junctions associated with *SOD1*. These specific genes are unimportant, chosen arbitrarily for illustration. However, it worth noting that the input data for `ggtranscript`, as a `ggplot2` extension, is required be a [`data.frame`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/data.frame) or [`tibble`](https://tibble.tidyverse.org).
```{r example-data}
sod1_annotation %>% head()
pknox1_annotation %>% head()
sod1_junctions
```
### Importing data from a gtf
You may be asking, what if I have a `gtf` file or a `GRanges` object? The below demonstrates how to wrangle a `gtf` into the required format for `ggtranscript` and extract the relevant annotation for a particular gene of interest.
For the purposes of the vignette, here we download a `gtf` (Ensembl version 105), then load the `gtf` into `R` using `rtracklayer::import()`.
```{r import-gtf-data}
# download ens 105 gtf into a temporary directory
gtf_path <- file.path(tempdir(), "Homo_sapiens.GRCh38.105.chr.gtf.gz")
download.file(
paste0(
"http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/",
"Homo_sapiens.GRCh38.105.chr.gtf.gz"
),
destfile = gtf_path
)
gtf <- rtracklayer::import(gtf_path)
class(gtf)
```
To note, the loaded `gtf` is a `GRanges` class object. The input data for `ggtranscript`, as a `ggplot2` extension, is required be a [`data.frame`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/data.frame) or [`tibble`](https://tibble.tidyverse.org). We can convert a `GRanges` to a `data.frame` using `as.data.frame` or a `tibble` via `dplyr::as_tibble()`. Either is fine with respect to `ggtranscript`, however we prefer `tibble`s over `data.frame`s for several [reasons](https://r4ds.had.co.nz/tibbles.html#tibbles-vs.-data.frame).
```{r convert-gtf-df}
gtf <- gtf %>% dplyr::as_tibble()
class(gtf)
```
Now that the `gtf` is a `tibble` (or `data.frame` object), we can `dplyr::filter()` rows and `dplyr::select()` columns to keep the annotation columns required for any specific gene of interest. Here, we illustrate how you would obtain the annotation for the gene *SOD1*, ready for plotting with `ggtranscript`.
```{r get-sod1-annot}
# filter your gtf for the gene of interest, here "SOD1"
gene_of_interest <- "SOD1"
sod1_annotation_from_gtf <- gtf %>%
dplyr::filter(
!is.na(gene_name),
gene_name == gene_of_interest
)
# extract the required annotation columns
sod1_annotation_from_gtf <- sod1_annotation_from_gtf %>%
dplyr::select(
seqnames,
start,
end,
strand,
type,
gene_name,
transcript_name,
transcript_biotype
)
sod1_annotation_from_gtf %>% head()
```
### Importing data from a bed file
If users would like to plot ranges from a `bed` file using `ggtranscript`, they can first import the `bed` file into `R` using `rtracklayer::import.bed()`. This method will create a `UCSCData` object.
```{r import-bed-data}
# for the example, we'll use the test bed file provided by rtracklayer
test_bed <- system.file("tests/test.bed", package = "rtracklayer")
bed <- rtracklayer::import.bed(test_bed)
class(bed)
```
A `UCSCData` object can be coerced into a `tibble`, a data structure which can be plotted using `ggplot2`/`ggtranscript`, using `dplyr::as_tibble()`.
```{r convert-bed-df}
bed <- bed %>% dplyr::as_tibble()
class(bed)
bed %>% head()
```
## Basic usage
### Required aesthetics
`ggtranscript` introduces 5 new geoms designed to simplify the visualization of transcript structure and annotation; `geom_range()`, `geom_half_range()`, `geom_intron()`, `geom_junction()` and `geom_junction_label_repel()`. The required aesthetics (`aes()`) for these geoms are designed to match the data formats which are widely used in genetic and transcriptomic analyses:
```{r geom-aes, echo = FALSE}
dplyr::tribble(
~`Required aes()`, ~Type, ~Description, ~`Required by`,
#-------|-----|----|--------
"xstart", "integer", "Start position in base pairs", "All geoms",
"xend", "integer", "End position in base pairs", "All geoms",
"y", "charactor or factor", "The group used for the y axis, most often a transcript id or name ", "All geoms",
"label", "integer or charactor", "Variable used to label junction curves", "Only geom_junction_label_repel()",
) %>%
knitr::kable()
```
### Plotting exons and introns {#plotting_exons_and_introns}
In the simplest case, the core components of transcript structure are exons and introns, which can be plotted using `geom_range()` and `geom_intron()`. In order to facilitate this, `ggtranscript` also provides `to_intron()`, which converts exon co-ordinates into introns. Therefore, you can plot transcript structures with only exons as input and just a few lines of code.
> 📌: As `ggtranscript` geoms share required aesthetics, it is recommended to set these via `ggplot()`, rather than in the individual `geom_*()` calls to avoid code duplication.
```{r geom-range-intron}
# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation %>% head()
# extract exons
sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
sod1_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = to_intron(sod1_exons, "transcript_name"),
aes(strand = strand)
)
```
### Differentiating UTRs from the coding sequence
As suggested by it's name, `geom_range()` is designed to visualize range-based transcript annotation. This includes but is not limited to exons. For instance, for protein coding transcripts it can be useful to visually distinguish the coding sequence (CDS) of a transcript from it's UTRs. This can be achieved by adjusting the height and fill of `geom_range()` and overlaying the CDS on top of the exons (including UTRs).
```{r geom-range-intron-w-cds}
# filter for only exons from protein coding transcripts
sod1_exons_prot_cod <- sod1_exons %>%
dplyr::filter(transcript_biotype == "protein_coding")
# obtain cds
sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS")
sod1_exons_prot_cod %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_cds
) +
geom_intron(
data = to_intron(sod1_exons_prot_cod, "transcript_name"),
aes(strand = strand),
arrow.min.intron.length = 500,
)
```
### Plotting junctions
`geom_junction()` plots curved lines that are intended to represent junction reads. Junctions are reads obtained through RNA-sequencing (RNA-seq) data that map with gapped alignment to the genome. Often, this gap is indicative of a splicing event, but can also originate from other genomic events such as indels.
It can be useful to visually overlay junctions on top of an existing transcript structure. For example, this can help to understand which existing transcripts are expressed in the RNA-seq sample or inform the location or interpretation of the novel splice sites.
`geom_junction_label_repel()` adds labels to junction curves. This can useful for labeling junctions with a measure of their expression or support such as read counts or percent-spliced-in. Alternatively, you may choose to visually map this measure to the thickness of the junction curves by adjusting the the size `aes()`. Or, as shown below, both of these options can be combined.
```{r geom-junction, fig.height = 3}
# extract exons and cds for the MANE-select transcript
sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201")
# add transcript name column to junctions for plotting
sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201")
sod1_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
fill = "white",
height = 0.25
) +
geom_range(
data = sod1_201_cds
) +
geom_intron(
data = to_intron(sod1_201_exons, "transcript_name")
) +
geom_junction(
data = sod1_junctions,
aes(size = mean_count),
junction.y.max = 0.5
) +
geom_junction_label_repel(
data = sod1_junctions,
aes(label = round(mean_count, 2)),
junction.y.max = 0.5
) +
scale_size_continuous(range = c(0.1, 1))
```
## Visualizing transcript structure differences
### Context
One of the primary reasons for visualizing transcript structures is to better observe the differences between them. Often this can be achieved by simply plotting the exons and introns as shown in [basic usage](#plotting_exons_and_introns). However, for longer, complex transcripts this may not be as straight forward.
For example, the transcripts of *PKNOX1* have relatively long introns, which makes the comparison between transcript structures (especially small differences in exons) more difficult.
> 📌: For relatively short introns, strand arrows may overlap exons. In such cases, the `arrow.min.intron.length` parameter of `geom_intron()` can be used to set the minimum intron length for a strand arrow to be plotted.
```{r transcript-diff-base}
# extract exons
pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
pknox1_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = to_intron(pknox1_exons, "transcript_name"),
aes(strand = strand),
arrow.min.intron.length = 3500
)
```
### Improving transcript structure visualisation using `shorten_gaps()`
`ggtranscript` provides the helper function `shorten_gaps()`, which reduces the size of the gaps (regions that do not overlap an exon). `shorten_gaps()` then rescales the exon and intron co-ordinates, preserving the original exon alignment. This allows you to hone in the differences of interest, such as the exonic structure.
> 📌: The rescaled co-ordinates returned by `shorten_gaps()` will not match the original genomic positions. Therefore, it is recommended that `shorten_gaps()` is used for visualizations purposes only.
```{r shorten-gaps}
# extract exons
pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
pknox1_rescaled <- shorten_gaps(
exons = pknox1_exons,
introns = to_intron(pknox1_exons, "transcript_name"),
group_var = "transcript_name"
)
# shorten_gaps() returns exons and introns all in one data.frame()
# let's split these for plotting
pknox1_rescaled_exons <- pknox1_rescaled %>% dplyr::filter(type == "exon")
pknox1_rescaled_introns <- pknox1_rescaled %>% dplyr::filter(type == "intron")
pknox1_rescaled_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = pknox1_rescaled_introns,
aes(strand = strand),
arrow.min.intron.length = 300
)
```
### Comparing between two transcripts using `geom_half_range()`
If you are interested in the differences between two transcripts, you can use `geom_half_range()` whilst adjusting `range.orientation` to plot the exons from each on the opposite sides of the transcript structure. This can reveal small differences in exon structure, such as those observed here at the 5' ends of *PKNOX1-201* and *PKNOX1-203*.
```{r geom-half-range, fig.height = 3}
# extract the two transcripts to be compared
pknox1_rescaled_201_exons <- pknox1_rescaled_exons %>%
dplyr::filter(transcript_name == "PKNOX1-201")
pknox1_rescaled_203_exons <- pknox1_rescaled_exons %>%
dplyr::filter(transcript_name == "PKNOX1-203")
pknox1_rescaled_201_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = "PKNOX1-201/203"
)) +
geom_half_range() +
geom_intron(
data = to_intron(pknox1_rescaled_201_exons, "transcript_name"),
arrow.min.intron.length = 300
) +
geom_half_range(
data = pknox1_rescaled_203_exons,
range.orientation = "top",
fill = "purple"
) +
geom_intron(
data = to_intron(pknox1_rescaled_203_exons, "transcript_name"),
arrow.min.intron.length = 300
)
```
### Comparing many transcripts to a single reference transcript using `to_diff()`
Sometimes, it can be useful to visualize the differences of several transcripts with respect to one transcript. For example, you may be interested in how other transcripts differ in structure to the MANE-select transcript. This exploration can reveal whether certain important regions are missing or novel regions are added, hinting at differences in transcript function.
`to_diff()` is a helper function designed for this situation. Here, we apply this to *PKNOX1*, finding the differences between all other transcripts and the MANE-select transcript (*PKNOX1-201*).
> 📌: Although below, we apply `to_diff()` to the rescaled exons and intron (outputted by `shorten_gaps()`), `to_diff()` can also be applied to the original, unscaled transcripts with the same effect.
```{r to-diff}
mane <- pknox1_rescaled_201_exons
not_mane <- pknox1_rescaled_exons %>%
dplyr::filter(transcript_name != "PKNOX1-201")
pknox1_rescaled_diffs <- to_diff(
exons = not_mane,
ref_exons = mane,
group_var = "transcript_name"
)
pknox1_rescaled_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range() +
geom_intron(
data = pknox1_rescaled_introns,
arrow.min.intron.length = 300
) +
geom_range(
data = pknox1_rescaled_diffs,
aes(fill = diff_type),
alpha = 0.2
)
```
## Integrating existing `ggplot2` functionality
As a `ggplot2` extension, `ggtranscript` inherits `ggplot2`'s familiarity and flexibility, enabling users to intuitively adjust aesthetics, parameters, scales etc as well as complement `ggtranscript` geoms with existing `ggplot2` geoms to create informative, publication-ready plots.
Below is a list outlining some examples of complementing `ggtranscript` with existing `ggplot2` functionality that we have found useful:
- Adding exon annotation such as [exon number/order](https://dzhang32.github.io/ggtranscript/reference/add_exon_number.html) using `add_exon_number()` and `geom_text()`
```{r exon-num-ex}
base_sod1_plot <- sod1_exons %>%
ggplot(aes(
xstart = start,
xend = end,
y = transcript_name
)) +
geom_range(
aes(fill = transcript_biotype)
) +
geom_intron(
data = to_intron(sod1_exons, "transcript_name"),
aes(strand = strand)
)
base_sod1_plot +
geom_text(
data = add_exon_number(sod1_exons, "transcript_name"),
aes(
x = (start + end) / 2, # plot label at midpoint of exon
label = exon_number
),
size = 3.5,
nudge_y = 0.4
)
```
- Zooming in on areas of interest using `coord_cartesian()` or `ggforce::facet_zoom()`
```{r zoom-ex}
base_sod1_plot +
coord_cartesian(xlim = c(31665500, 31669000))
```
- Plotting mutations using `geom_vline()`
```{r mutation-ex}
example_mutation <- dplyr::tibble(
transcript_name = "SOD1-204",
position = 31661600
)
# xstart and xend are set here to override the default aes()
base_sod1_plot +
geom_vline(
data = example_mutation,
aes(
xintercept = position,
xstart = NULL,
xend = NULL
),
linetype = 2,
colour = "red"
)
```
- Beautifying plots using themes and scales
```{r beautify-ex}
base_sod1_plot +
theme_bw() +
scale_x_continuous(name = "Position") +
scale_y_discrete(name = "Transcript name") +
scale_fill_discrete(
name = "Transcript biotype",
labels = c("Processed transcript", "Protein-coding")
)
```
## Session info
Show/hide
```{r session-info, echo = FALSE}
library("sessioninfo")
options(width = 120)
session_info()
```