Repository: dzhang32/ggtranscript Branch: master Commit: 682a0df688ad Files: 62 Total size: 227.7 KB Directory structure: gitextract_nqf_jr3r/ ├── .Rbuildignore ├── .github/ │ ├── .gitignore │ └── workflows/ │ └── check-bioc.yml ├── .gitignore ├── .pre-commit-config.yaml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R/ │ ├── add_exon_number.R │ ├── add_utr.R │ ├── data.R │ ├── geom_half_range.R │ ├── geom_intron.R │ ├── geom_junction.R │ ├── geom_junction_label_repel.R │ ├── geom_range.R │ ├── ggtranscript-package.R │ ├── globals.R │ ├── shorten_gaps.R │ ├── to_diff.R │ ├── to_intron.R │ └── utils.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── codecov.yml ├── data/ │ ├── pknox1_annotation.rda │ ├── sod1_annotation.rda │ └── sod1_junctions.rda ├── data-raw/ │ ├── ggplot2_exts_thumbnail.R │ ├── ggtranscript_logo.R │ ├── sod1_junctions.R │ └── sod1_pknox1_annotation.R ├── inst/ │ └── CITATION ├── man/ │ ├── add_exon_number.Rd │ ├── add_utr.Rd │ ├── geom_intron.Rd │ ├── geom_junction.Rd │ ├── geom_junction_label_repel.Rd │ ├── geom_range.Rd │ ├── ggtranscript.Rd │ ├── shorten_gaps.Rd │ ├── sod1_annotation.Rd │ ├── sod1_junctions.Rd │ ├── to_diff.Rd │ └── to_intron.Rd ├── tests/ │ ├── testthat/ │ │ ├── test-add_exon_number.R │ │ ├── test-add_utr.R │ │ ├── test-geom_half_range.R │ │ ├── test-geom_intron.R │ │ ├── test-geom_junction.R │ │ ├── test-geom_junction_label_repel.R │ │ ├── test-geom_range.R │ │ ├── test-shorten_gaps.R │ │ ├── test-to_diff.R │ │ ├── test-to_intron.R │ │ └── test-utils.R │ └── testthat.R └── vignettes/ ├── .gitignore └── ggtranscript.Rmd ================================================ FILE CONTENTS ================================================ ================================================ FILE: .Rbuildignore ================================================ ^ggtranscript\.Rproj$ ^\.Rproj\.user$ ^dev$ ^README\.Rmd$ ^\.github$ ^codecov\.yml$ ^.pre-commit-config.yaml$ ^data-raw$ ^_pkgdown.yml$ ^tests/testthat/_snaps/* ^LICENSE\.md$ ^cran-comments\.md$ ================================================ FILE: .github/.gitignore ================================================ *.html ================================================ FILE: .github/workflows/check-bioc.yml ================================================ on: push: pull_request: name: R-CMD-check-bioc ## These environment variables control whether to run GHA code later on that is ## specific to testthat, covr, and pkgdown. env: has_testthat: 'true' run_covr: 'true' run_pkgdown: 'true' has_RUnit: 'false' cache-version: 'cache-v1' jobs: build-check: runs-on: ${{ matrix.config.os }} name: ${{ matrix.config.os }} (${{ matrix.config.r }}) container: ${{ matrix.config.cont }} strategy: fail-fast: false matrix: config: - { os: ubuntu-latest, r: '4.4', bioc: '3.19', cont: "bioconductor/bioconductor_docker:RELEASE_3_19", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} NOT_CRAN: true TZ: UTC GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - name: Set R Library home on Linux if: runner.os == 'Linux' run: | mkdir /__w/_temp/Library echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile - name: Checkout Repository uses: actions/checkout@v3 - name: Setup R from r-lib if: runner.os != 'Linux' uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} http-user-agent: ${{ matrix.config.http-user-agent }} - name: Setup pandoc from r-lib if: runner.os != 'Linux' uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | install.packages('remotes') saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) shell: Rscript {0} - name: Restore R package cache if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" uses: actions/cache@v3 with: path: ${{ env.R_LIBS_USER }} key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-${{ hashFiles('.github/depends.Rds') }} restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " uses: actions/cache@v3 with: path: /home/runner/work/_temp/Library key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-${{ hashFiles('.github/depends.Rds') }} restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1- - name: Install Linux system dependencies if: runner.os == 'Linux' run: | sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') echo $sysreqs sudo -s eval "$sysreqs" - name: Install BiocManager run: | message(paste('****', Sys.time(), 'installing BiocManager ****')) remotes::install_cran("BiocManager") shell: Rscript {0} - name: Set BiocVersion run: | BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE) shell: Rscript {0} - name: Install dependencies pass 1 run: | ## Try installing the package dependencies in steps. First the local ## dependencies, then any remaining dependencies to avoid the ## issues described at ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html ## https://github.com/r-lib/remotes/issues/296 ## Ideally, all dependencies should get installed in the first pass. ## Set the repos source depending on the OS ## Alternatively use https://storage.googleapis.com/bioconductor_docker/packages/ ## though based on https://bit.ly/bioc2021-package-binaries ## the Azure link will be the main one going forward. gha_repos <- if( .Platform$OS.type == "unix" && Sys.info()["sysname"] != "Darwin" ) c( "AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/3.14/bioc", BiocManager::repositories() ) else BiocManager::repositories() ## For running the checks message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) install.packages(c("rcmdcheck", "BiocCheck"), repos = gha_repos) ## Pass #1 at installing dependencies ## This pass uses AnVIL-powered fast binaries ## details at https://github.com/nturaga/bioc2021-bioconductor-binaries ## The speed gains only apply to the docker builds. message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) remotes::install_local(dependencies = TRUE, repos = gha_repos, build_vignettes = FALSE, upgrade = TRUE) continue-on-error: true shell: Rscript {0} - name: Install dependencies pass 2 run: | ## Pass #2 at installing dependencies ## This pass does not use AnVIL and will thus update any packages ## that have seen been updated in Bioconductor message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE) shell: Rscript {0} - name: Install BiocGenerics if: env.has_RUnit == 'true' run: | ## Install BiocGenerics BiocManager::install("BiocGenerics") shell: Rscript {0} - name: Install covr if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' run: | remotes::install_cran("covr") shell: Rscript {0} - name: Install pkgdown if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: | remotes::install_cran("pkgdown") shell: Rscript {0} - name: Session info run: | options(width = 100) pkgs <- installed.packages()[, "Package"] sessioninfo::session_info(pkgs, include_base = TRUE) shell: Rscript {0} - name: Run CMD check env: _R_CHECK_CRAN_INCOMING_: false DISPLAY: 99.0 run: | options(crayon.enabled = TRUE) rcmdcheck::rcmdcheck( args = c("--no-manual", "--no-vignettes", "--timings"), build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), error_on = "warning", check_dir = "check" ) shell: Rscript {0} ## Might need an to add this to the if: && runner.os == 'Linux' - name: Reveal testthat details if: env.has_testthat == 'true' run: find . -name testthat.Rout -exec cat '{}' ';' - name: Run RUnit tests if: env.has_RUnit == 'true' run: | BiocGenerics:::testPackage() shell: Rscript {0} - name: Run BiocCheck env: DISPLAY: 99.0 run: | BiocCheck::BiocCheck( dir('check', 'tar.gz$', full.names = TRUE), `quit-with-status` = FALSE, `no-check-R-ver` = TRUE, `no-check-bioc-help` = TRUE ) shell: Rscript {0} - name: Test coverage if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' run: | covr::codecov() shell: Rscript {0} - name: Install package if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: R CMD INSTALL . - name: Get R package info if: runner.os == 'Linux' run: | #### DockerHub repos must be lowercase (,,) #### name=$(grep '^Package:' DESCRIPTION | cut -d\ -f2) echo "packageNameOrig=${name}" >> $GITHUB_ENV echo $name version=$(grep Version DESCRIPTION | grep -o "[0-9.]\+") echo "packageVersion=${version}" >> $GITHUB_ENV echo $version shell: bash {0} - name: Build and deploy pkgdown site if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: | git config --global --add safe.directory /__w/${{env.packageNameOrig}}/${{env.packageNameOrig}} git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" shell: bash {0} ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) ## at least one locally before this will work. This creates the gh-pages ## branch (erasing anything you haven't version controlled!) and ## makes the git history recognizable by pkgdown. - name: Upload check results if: failure() uses: actions/upload-artifact@master with: name: ${{ runner.os }}-biocversion-RELEASE_3_14-r-4.1-results path: check ================================================ FILE: .gitignore ================================================ .Rproj.user inst/doc *.DS_Store *.pdf *.Rproj ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/lorenzwalthert/precommit rev: v0.1.3.9133 hooks: - id: readme-rmd-rendered # make sure README.Rmd is rendered to README.md - id: parsable-R exclude: > (?x)^( tests/testthat/in/style-files-fail-parse\.R| tests/testthat/in/parsable-R-fail\.R| )$ - id: style-files # style code in the tidyverse style args: [--indent_by=4] exclude: > (?x)^( tests/testthat/in/.*\.R| renv/.* )$ - id: deps-in-desc # all dependencies pkg::func are in listed in dec args: [--allow_private_imports] exclude: > (?x)^( tests/testthat/in/.*| inst/renv-update\.R| renv/activate.R| vignettes/FAQ\.Rmd| )$ - id: lintr args: [--warn_only] verbose: true - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.0.1 hooks: - id: check-added-large-files # make sure no large files commited - id: end-of-file-fixer exclude: '\.Rd' ================================================ FILE: DESCRIPTION ================================================ Package: ggtranscript Title: Visualizing Transcript Structure and Annotation using 'ggplot2' Version: 1.0.0 Authors@R: c( person("David", "Zhang", , "dyzhang32@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2382-8460")), person("Emil", "Gustavsson", , "e.gustavsson@ucl.ac.uk", role = c("aut"), comment = c(ORCID = "0000-0003-0541-7537")), person("Regina", "Reynolds", , "regina.reynolds.16@ucl.ac.uk", role = c("ctb"), comment = c(ORCID = "0000-0001-6470-7919")), person("Sonia", "Ruiz", , "s.ruiz@ucl.ac.uk", role = c("ctb")) ) Description: The goal of ggtranscript is the simplify the process of visualizing transcript structure and annotation. To achieve this, ggtranscript introduces 5 new geoms (geom_range(), geom_half_range(), geom_intron(), geom_junction() and geom_junction_label_repel()) as well as several helper functions. As a 'ggplot2' extension, ggtranscript inherits 'ggplot2's familiarity and flexibility, enabling users to intuitively adjust aesthetics, parameters, scales etc as well as complement ggtranscript geoms with existing 'ggplot2' geoms to create informative, publication-ready plots. License: MIT + file LICENSE URL: https://github.com/dzhang32/ggtranscript BugReports: https://github.com/dzhang32/ggtranscript/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 Suggests: BiocStyle, covr, ggpubr, knitr, rmarkdown, rtracklayer, sessioninfo, testthat (>= 3.0.0), vdiffr Config/testthat/edition: 3 VignetteBuilder: knitr Depends: R (>= 2.10) LazyData: true Imports: dplyr, GenomicRanges, ggplot2, magrittr, rlang, S4Vectors, GenomeInfoDb, ggrepel Collate: 'add_exon_number.R' 'add_utr.R' 'data.R' 'geom_range.R' 'geom_half_range.R' 'geom_intron.R' 'geom_junction.R' 'geom_junction_label_repel.R' 'ggtranscript-package.R' 'globals.R' 'shorten_gaps.R' 'to_diff.R' 'to_intron.R' 'utils.R' ================================================ FILE: LICENSE ================================================ YEAR: 2022 COPYRIGHT HOLDER: ggtranscript authors ================================================ FILE: LICENSE.md ================================================ # MIT License Copyright (c) 2022 ggtranscript authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: NAMESPACE ================================================ # Generated by roxygen2: do not edit by hand export(add_exon_number) export(add_utr) export(geom_half_range) export(geom_intron) export(geom_junction) export(geom_junction_label_repel) export(geom_range) export(shorten_gaps) export(to_diff) export(to_intron) import(ggrepel) importFrom(ggplot2,aes) importFrom(magrittr,"%>%") importFrom(rlang,"%||%") ================================================ FILE: NEWS.md ================================================ # ggtranscript 1.0.0 ## Fixes * Updates `geom`s to work with the latest version of `ggplot2` and `ggrepel` (R version `4.4`). * Fixes all unit tests. * Fixes CI workflow issues and simplifies testing to linux-only. # ggtranscript 0.99.9 ## NEW FEATURES * Address ggtranscript reviews; update docs with examples of using gtf/bed files, integration with `ggplot2` extensions and add usage of `shorten_gaps()` to README. # ggtranscript 0.99.8 ## NEW FEATURES * Address CRAN feedback; changing ggplot2 -> 'ggplot2', remove biocViews and contributing, removing Date field in DESCRIPTION. # ggtranscript 0.99.7 ## NEW FEATURES * Add `cran-comments.md` in preparation for first CRAN submission. * Update CI to run `R CMD Check` on latest R version (4.2). # ggtranscript 0.99.6 ## NEW FEATURES * Add `@return` documentation for `geom_*` functions for `BiocCheck`. # ggtranscript 0.99.5 ## NEW FEATURES * Change branch to naming from main to master to match BBS. # ggtranscript 0.99.4 ## NEW FEATURES * Change email to UCL email for Bioconductor submission. ## NEW FEATURES * Add `add_utr()` for adding UTRs as ranges. This helper function is designed to work with `shorten_gaps()`, enabling shortening of gaps whilst visually differentiating UTRs from the CDS. * Allow `to_intron()` to take CDS and UTRs ranges as input. * Submit to Bioconductor. # ggtranscript 0.99.2 ## NEW FEATURES * Add `geom_junction_label_repel()` for labeling junctions (e.g. with counts). * Add `add_exon_number()` for visualizing the exon number/order. # ggtranscript 0.99.1 ## NEW FEATURES * Implement base geoms: `geom_range()`, `geom_half_range()`, `geom_intron()`, `geom_junction()` and helper functions: `to_intron()`, `to_diff()` and `shorten_gaps()`. ================================================ FILE: R/add_exon_number.R ================================================ #' Add exon number #' #' `add_exon_number()` adds the exon number (the order the exons are transcribed #' within each transcript) as a column in `exons`. This can be useful when #' visualizing long, complex transcript structures, in order to keep track of #' specific exons of interest. #' #' To note, a "strand" column must be present within `exons`. The strand is used #' to differentiate whether exon numbers should be calculated according to #' ascending ("+") or descending ("-") genomic co-ordinates. For ambiguous #' strands ("*"), `add_exon_number()` will be assume the strand be "+". #' #' @inheritParams to_diff #' #' @return `data.frame()` equivalent to input `exons`, with the additional #' column "exon_number". #' #' @export #' @examples #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' sod1_annotation %>% head() #' #' # extract exons #' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") #' sod1_exons %>% head() #' #' # add the exon number for each transcript #' sod1_exons <- sod1_exons %>% add_exon_number(group_var = "transcript_name") #' #' base <- sod1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = to_intron(sod1_exons, "transcript_name"), #' strand = "+" #' ) #' #' # it can be useful to annotate exons with their exon number #' # using ggplot2::geom_text() #' base + #' geom_text(aes( #' x = (start + end) / 2, # plot label at midpoint of exon #' label = exon_number #' ), #' size = 3.5, #' nudge_y = 0.4 #' ) #' #' # Or alternatively, using ggrepel::geom_label_repel() #' # to separate labels from exons #' base + #' ggrepel::geom_label_repel(ggplot2::aes( #' x = (start + end) / 2, #' label = exon_number #' ), #' size = 3.5, #' min.segment.length = 0 #' ) add_exon_number <- function(exons, group_var = NULL) { .check_coord_object(exons, check_strand = TRUE) .check_group_var(exons, group_var) if (!is.null(group_var)) { exons <- exons %>% dplyr::group_by_at(.vars = group_var) } # arrange to make sure order reflects genomic position exons <- exons %>% dplyr::arrange_at(c(.vars = c(group_var, "start", "end"))) # add exon number, assuming all plus strand at start exons <- exons %>% dplyr::mutate( exon_number = dplyr::row_number(), n_exons = dplyr::n() ) %>% dplyr::ungroup() # convert exon number for minus strand exons <- exons %>% dplyr::mutate( exon_number = ifelse( strand == "-", n_exons - exon_number + 1, exon_number ) ) %>% dplyr::select(-n_exons) return(exons) } ================================================ FILE: R/add_utr.R ================================================ #' Add untranslated regions (UTRs) #' #' Given a set of `exons` (encompassing the CDS and UTRs) and `cds` regions, #' `add_utr()` will calculate and add the corresponding UTR regions as ranges. #' This can be useful when combined with `shorten_gaps()` to visualize #' transcripts with long introns, whilst differentiating UTRs from CDS regions. #' #' The definition of the inputted `cds` regions are expected to range from the #' beginning of the start codon to the end of the stop codon. Sometimes, for #' example in the case of Ensembl, reference annotation will omit the stop #' codons from the CDS definition. In such cases, users should manually ensure #' that the `cds` includes both the start and stop codons. #' #' @inheritParams to_diff #' @param cds `data.frame()` contains coding sequence ranges for the transcripts #' in `exons`. #' #' @return `data.frame()` contains differentiated CDS and UTR ranges. #' #' @export #' @examples #' #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' pknox1_annotation %>% head() #' #' # extract exons #' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") #' pknox1_exons %>% head() #' #' # extract cds #' pknox1_cds <- pknox1_annotation %>% dplyr::filter(type == "CDS") #' pknox1_cds %>% head() #' #' # the CDS definition originating from the Ensembl reference annotation #' # does not include the stop codon #' # we must incorporate the stop codons into the CDS manually #' # by adding 3 base pairs to the end of the CDS of each transcript #' pknox1_cds_w_stop <- pknox1_cds %>% #' dplyr::group_by(transcript_name) %>% #' dplyr::mutate( #' end = ifelse(end == max(end), end + 3, end) #' ) %>% #' dplyr::ungroup() #' #' # add_utr() adds ranges that represent the UTRs #' pknox1_cds_utr <- add_utr( #' pknox1_exons, #' pknox1_cds_w_stop, #' group_var = "transcript_name" #' ) #' #' pknox1_cds_utr %>% head() #' #' # this can be useful when combined with shorten_gaps() #' # to visualize transcripts with long introns whilst differentiating UTRs #' pknox1_cds_utr_rescaled <- #' shorten_gaps( #' exons = pknox1_cds_utr, #' introns = to_intron(pknox1_cds_utr, "transcript_name"), #' group_var = "transcript_name" #' ) #' #' pknox1_cds_utr_rescaled %>% #' dplyr::filter(type == "CDS") %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_range( #' data = pknox1_cds_utr_rescaled %>% dplyr::filter(type == "UTR"), #' height = 0.25, #' fill = "white" #' ) + #' geom_intron( #' data = to_intron( #' pknox1_cds_utr_rescaled %>% dplyr::filter(type != "intron"), #' "transcript_name" #' ), #' arrow.min.intron.length = 110 #' ) add_utr <- function(exons, cds, group_var = NULL) { # input checks .check_coord_object(exons, check_seqnames = TRUE) .check_group_var(exons, group_var) .check_coord_object(cds, check_seqnames = TRUE) .check_group_var(cds, group_var) # we have to create dummy group for downstream for loop if there is no group null_group <- is.null(group_var) if (null_group) { exons <- exons %>% dplyr::mutate(dummy_group = "A") cds <- cds %>% dplyr::mutate(dummy_group = "A") group_var <- "dummy_group" } groups <- cds[[group_var]] %>% unique() # convert to GenomicRanges for downstream processing exons_gr <- exons %>% GenomicRanges::GRanges() cds_gr <- cds %>% GenomicRanges::GRanges() exons_w_utr <- vector("list", length = length(groups)) for (i in seq_along(groups)) { exons_gr_curr <- exons_gr %>% .[GenomicRanges::mcols(exons_gr)[[group_var]] == groups[i]] cds_gr_curr <- cds_gr %>% .[GenomicRanges::mcols(cds_gr)[[group_var]] == groups[i]] # use setdiff to get regions in exon but not in cds (i.e. the utrs) utrs_curr <- GenomicRanges::setdiff(exons_gr_curr, cds_gr_curr) GenomicRanges::mcols(utrs_curr)[[group_var]] <- groups[i] utrs_curr$type <- "UTR" cds_gr_curr$type <- "CDS" exons_w_utr[[i]] <- c(utrs_curr, cds_gr_curr) %>% sort() } exons_w_utr <- exons_w_utr %>% do.call(c, .) %>% as.data.frame() %>% dplyr::as_tibble() # remove dummp_group if created if (null_group) { exons_w_utr <- exons_w_utr %>% dplyr::select(-dummy_group) } return(exons_w_utr) } ================================================ FILE: R/data.R ================================================ #' Example transcript annotation #' #' Transcript annotation including the co-ordinates (hg38) of the genes, #' transcripts, exons and CDS regions for \emph{SOD1} and \emph{PKNOX1}, which #' originate from version 105 of the Ensembl reference annotation. #' #' @format A `tibble::tibble()`: #' \describe{ #' \item{seqnames}{`factor()` chromosome.} #' \item{start}{`integer()` start position.} #' \item{end}{`integer()` end position.} #' \item{strand}{`factor()` strand.} #' \item{type}{`factor()` E.g.gene, transcript, exon or CDS.} #' \item{gene_name}{`character()` name of gene (GBA).} #' \item{transcript_name}{`character()` name of transcript.} #' \item{transcript_biotype}{`character()` biotype of transcript.} #' } #' #' @source generated using `ggtranscript/data-raw/sod1_pknox1_annotation.R` "sod1_annotation" #' @rdname sod1_annotation "pknox1_annotation" #' Example junctions #' #' Junction co-ordinates and counts associated with the \emph{SOD1} gene. #' Junctions counts originate from GTEx liver samples and are downloaded via the #' Bioconductor package `snapcount`. Only unannotated junctions with a mean #' count above 0.3 have been retained for this example. #' #' @format A `tibble::tibble()`: #' \describe{ #' \item{seqnames}{`factor()` chromosome.} #' \item{start}{`integer()` start position.} #' \item{end}{`integer()` end position.} #' \item{strand}{`factor()` strand.} #' \item{mean_count}{`factor()` Average count across all GTEx liver samples.} #' } #' #' @source generated using `ggtranscript/data-raw/sod1_junctions.R` "sod1_junctions" ================================================ FILE: R/geom_half_range.R ================================================ #' @param range.orientation `character()` one of "top" or "bottom", specifying #' where the half ranges will be plotted with respect to each transcript #' (`y`). #' #' @export #' @rdname geom_range geom_half_range <- function(mapping = NULL, data = NULL, stat = "identity", position = "identity", ..., range.orientation = "bottom", linejoin = "mitre", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) { ggplot2::layer( data = data, mapping = mapping, stat = stat, geom = GeomHalfRange, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( range.orientation = range.orientation, linejoin = linejoin, na.rm = na.rm, ... ) ) } #' `GeomHalfRange` is `GeomRange` with default parameters for `vjust` and #' `height` as well as the added parameter `range.orientation` #' #' @include geom_range.R #' @keywords internal #' @noRd GeomHalfRange <- ggplot2::ggproto("GeomHalfRange", GeomRange, setup_data = function(data, params) { # check that range.orientation is one of possible options .check_range.orientation(params) # modified from ggplot2::GeomTile data$height <- data$height %||% params$height %||% 0.25 transform( data, xmin = xstart, xmax = xend, ymin = y - height / 2, ymax = y + height / 2, height = NULL ) }, draw_panel = function(data, panel_params, coord, range.orientation = "bottom", lineend = "butt", linejoin = "mitre") { vjust <- ifelse( range.orientation == "bottom", 1.5, 0.5 ) GeomRange$draw_panel( data = data, panel_params = panel_params, coord = coord, vjust = vjust, lineend = lineend, linejoin = linejoin ) } ) #' @keywords internal #' @noRd .check_range.orientation <- function(params) { not_orient_option <- !(params$range.orientation %in% c("top", "bottom")) if (not_orient_option) { stop( "range.orientation must be one of ", "'alternating', 'top' or 'bottom'" ) } } ================================================ FILE: R/geom_intron.R ================================================ #' Plot intron lines with strand arrows #' #' `geom_intron()` draws horizontal lines with central arrows that are designed #' to represent introns. In combination with `geom_range()`/`geom_half_range()`, #' these geoms form the core components for visualizing transcript structures. #' #' `geom_intron()` requires the following `aes()`; `xstart`, `xend` and `y` #' (e.g. transcript name). If users do not have intron co-ordinates, these can #' be generated from the corresponding exons using `to_intron()`. The `strand` #' option (one of "+" or "-") adjusts the arrow direction to match the direction #' of transcription. The `arrow.min.intron.length` parameter can be useful to #' remove strand arrows that overlap exons, which can be a problem if plotted #' introns include those that are relatively short. #' #' @inheritParams ggplot2::layer #' @inheritParams ggplot2::geom_point #' @inheritParams ggplot2::geom_segment #' @param arrow.min.intron.length `integer()` the minimum required width of an #' intron for a strand arrow to be drawn. This can be useful to remove strand #' arrows on short introns that overlap adjacent exons. #' #' @return the return value of a `geom_*` function is not intended to be #' directly handled by users. Therefore, `geom_*` functions should never be #' executed in isolation, rather used in combination with a #' `ggplot2::ggplot()` call. #' #' @export #' @examples #' #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' pknox1_annotation %>% head() #' #' # extract exons #' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") #' pknox1_exons %>% head() #' #' # to_intron() is a helper function included in ggtranscript #' # which is useful for converting exon co-ordinates to introns #' pknox1_introns <- pknox1_exons %>% to_intron(group_var = "transcript_name") #' pknox1_introns %>% head() #' #' base <- pknox1_introns %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) #' #' # by default, geom_intron() assumes introns originate from the "+" strand #' base + geom_intron() #' #' # however this can be modified using the strand option #' base + geom_intron(strand = "-") #' #' # strand can also be set as an aes() #' base + geom_intron(aes(strand = strand)) #' #' # as a ggplot2 extension, ggtranscript geoms inherit the #' # the functionality from the parameters and aesthetics in ggplot2 #' base + geom_intron( #' aes(colour = transcript_name), #' linewidth = 1 #' ) #' #' # together, geom_range() and geom_intron() are designed to visualize #' # the core components of transcript annotation #' pknox1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = pknox1_introns #' ) #' #' # for short introns, sometimes strand arrows will overlap exons #' # to avoid this, users can set the arrow.min.intron.length parameter #' pknox1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = pknox1_introns, #' arrow.min.intron.length = 3500 #' ) geom_intron <- function(mapping = NULL, data = NULL, stat = "identity", position = "identity", ..., arrow = grid::arrow(ends = "last", length = grid::unit(0.1, "inches")), arrow.fill = NULL, lineend = "butt", linejoin = "round", na.rm = FALSE, arrow.min.intron.length = 0, show.legend = NA, inherit.aes = TRUE) { ggplot2::layer( data = data, mapping = mapping, stat = stat, geom = GeomIntron, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( arrow = arrow, arrow.fill = arrow.fill, lineend = lineend, linejoin = linejoin, na.rm = na.rm, arrow.min.intron.length = arrow.min.intron.length, ... ) ) } #' `GeomIntron` is pretty much `ggplot2::GeomSegment` with the `required_aes` #' changed to `xstart`/`xend` to match genetic nomenclature and the added arrows #' to indicate direction of transcription (configured with `strand` and #' `arrow.min.intron.length`) #' @noRd GeomIntron <- ggplot2::ggproto("GeomIntron", ggplot2::GeomSegment, required_aes = c("xstart", "xend", "y"), default_aes = aes( colour = "black", linewidth = 0.5, linetype = 1, alpha = NA, strand = "+" ), setup_params = function(data, params) { # check that arrow.min.intron.length numeric is >= 0 arrow.min_numeric <- is.numeric(params$arrow.min.intron.length) arrow.min_neg <- params$arrow.min.intron.length < 0 if (!arrow.min_numeric | arrow.min_neg) { stop("arrow.min.intron.length must be a numeric > 0") } params }, setup_data = function(data, params) { # needed to permit usage of xstart/xend transform( data, x = xstart, yend = y, xstart = NULL ) }, draw_panel = function(data, panel_params, coord, arrow = NULL, arrow.fill = NULL, lineend = "butt", linejoin = "round", na.rm = FALSE, arrow.min.intron.length = 0) { # check that strand is scalar and one of "+" or "-" .check_strand(data$strand) # first, create the intron grob, which is just a pure line (no arrow) intron_grob <- ggplot2::GeomSegment$draw_panel( data = data, panel_params = panel_params, coord = coord, arrow = NULL, arrow.fill = NULL, lineend = lineend, linejoin = linejoin, na.rm = na.rm ) # then, create the arrow grobs, one per strand # need both as the direction of arrow (as far I can tell) is # is dependent on the orientation of the x/xend strand_arrow_plus_grob <- .create_strand_arrow_grob( target_strand = "+", arrow.min.intron.length = arrow.min.intron.length, data = data, panel_params = panel_params, coord = coord, arrow = arrow, arrow.fill = arrow.fill, lineend = lineend, linejoin = linejoin, na.rm = na.rm ) strand_arrow_minus_grob <- .create_strand_arrow_grob( target_strand = "-", arrow.min.intron.length = arrow.min.intron.length, data = data, panel_params = panel_params, coord = coord, arrow = arrow, arrow.fill = arrow.fill, lineend = lineend, linejoin = linejoin, na.rm = na.rm ) # draw_panel expects return of a grob # here, as we build multiple grobs (i.e. intron lines + arrows) # we use a grobTree to combine the two grid::grobTree( intron_grob, strand_arrow_plus_grob, strand_arrow_minus_grob ) } ) #' @keywords internal #' @noRd .check_strand <- function(strand) { # TODO - add option for "*" arrow? any_na <- any(is.na(strand)) plus_minus <- !(all(strand %in% c("+", "-"))) if (any_na | plus_minus) { stop("strand values must be one of '+' and '-'") } return(invisible()) } #' @keywords internal #' @noRd .create_strand_arrow_grob <- function(target_strand, arrow.min.intron.length, data, panel_params, coord, arrow, arrow.fill, lineend, linejoin, na.rm) { # filter for introns that match target strand # and have a length above arrow.min.intron.length match_strand <- data$strand == target_strand ab_min <- abs(data$x - data$xend) > arrow.min.intron.length arrow_data <- data[match_strand & ab_min, ] # if there are no arrows to plot, use a nullGrob() to add nothing if (nrow(arrow_data) == 0) { arrow_grob <- grid::nullGrob() } else { # obtain the the correct orientation of arrow (dependent on strand) # as the arrow can only be placed at either end of a geom_segment/path # the strand changes the x/xends around, shifting the around direction if (target_strand == "+") { arrow_data <- transform( arrow_data, xend = (x + xend) / 2 ) } else { arrow_data <- transform( arrow_data, mid = (x + xend) / 2, x = xend ) arrow_data <- transform( arrow_data, xend = mid ) } arrow_grob <- ggplot2::GeomSegment$draw_panel( data = arrow_data, panel_params = panel_params, coord = coord, arrow = arrow, arrow.fill = arrow.fill, lineend = lineend, linejoin = linejoin, na.rm = na.rm ) } return(arrow_grob) } ================================================ FILE: R/geom_junction.R ================================================ #' Plot junction curves #' #' `geom_junction()` draws curves that are designed to represent junction reads #' from RNA-sequencing data. It can be useful to overlay junction data on #' transcript annotation (plotted using `geom_range()`/`geom_half_range()` and #' `geom_intron()`) to understand which splicing events or transcripts have #' support from RNA-sequencing data. #' #' `geom_junction()` requires the following `aes()`; `xstart`, `xend` and `y` #' (e.g. transcript name). `geom_junction()` curves can be modified using #' `junction.y.max`, which can be useful when junctions overlap one #' another/other transcripts or extend beyond the plot margins. By default, #' junction curves will alternate between being plotted on the top and bottom of #' each transcript (`y`), however this can be modified via #' `junction.orientation`. #' #' @inheritParams ggplot2::layer #' @inheritParams ggplot2::geom_bar #' @inheritParams grid::curveGrob #' @param junction.orientation `character()` one of "alternating", "top" or #' "bottom", specifying where the junctions will be plotted with respect to #' each transcript (`y`). #' @param junction.y.max `double()` the max y-value of each junction curve. It #' can be useful to adjust this parameter when junction curves overlap with #' one another/other transcripts or extend beyond the plot margins. #' #' @return the return value of a `geom_*` function is not intended to be #' directly handled by users. Therefore, `geom_*` functions should never be #' executed in isolation, rather used in combination with a #' `ggplot2::ggplot()` call. #' #' @export #' @examples #' #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' sod1_annotation %>% head() #' #' # as well as a set of example (unannotated) junctions #' # originating from GTEx and downloaded via the Bioconductor package snapcount #' sod1_junctions #' #' # extract exons #' sod1_exons <- sod1_annotation %>% dplyr::filter( #' type == "exon", #' transcript_name == "SOD1-201" #' ) #' sod1_exons %>% head() #' #' # add transcript_name to junctions for plotting #' sod1_junctions <- sod1_junctions %>% #' dplyr::mutate(transcript_name = "SOD1-201") #' #' # junctions can be plotted as curves using geom_junction() #' base <- sod1_junctions %>% #' ggplot2::ggplot(ggplot2::aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) #' #' # sometimes, depending on the number and widths of transcripts and junctions #' # junctions will go overlap one another or extend beyond the plot margin #' base + geom_junction() #' #' # in such cases, junction.y.max can be adjusted to modify the max y of curves #' base + geom_junction(junction.y.max = 0.5) #' #' # ncp can be used improve the smoothness of curves #' base + geom_junction(junction.y.max = 0.5, ncp = 30) #' #' # junction.orientation controls where the junction are plotted #' # with respect to each transcript #' # either alternating (default), or on the top or bottom #' base + geom_junction(junction.orientation = "top", junction.y.max = 0.5) #' base + geom_junction(junction.orientation = "bottom", junction.y.max = 0.5) #' #' # it can be useful useful to overlay junction curves onto existing annotation #' # plotted using geom_range() and geom_intron() #' base <- sod1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = to_intron(sod1_exons, "transcript_name") #' ) #' #' base + geom_junction( #' data = sod1_junctions, #' junction.y.max = 0.5 #' ) #' #' # as a ggplot2 extension, ggtranscript geoms inherit the #' # the functionality from the parameters and aesthetics in ggplot2 #' # this can be useful when mapping junction thickness to their counts #' base + geom_junction( #' data = sod1_junctions, #' aes(linewidth = mean_count), #' junction.y.max = 0.5, #' colour = "purple" #' ) + #' scale_linewidth(range = c(0.1, 1)) #' #' # it can be useful to combine geom_junction() with geom_half_range() #' sod1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_half_range() + #' geom_intron( #' data = to_intron(sod1_exons, "transcript_name") #' ) + #' geom_junction( #' data = sod1_junctions, #' aes(linewidth = mean_count), #' junction.y.max = 0.5, #' junction.orientation = "top", #' colour = "purple" #' ) + #' scale_linewidth(range = c(0.1, 1)) geom_junction <- function(mapping = NULL, data = NULL, stat = "identity", position = "identity", junction.orientation = "alternating", junction.y.max = 1, angle = 90, ncp = 15, na.rm = FALSE, orientation = NA, show.legend = NA, inherit.aes = TRUE, ...) { ggplot2::layer( data = data, mapping = mapping, stat = stat, geom = GeomJunction, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( junction.orientation = junction.orientation, junction.y.max = junction.y.max, angle = angle, ncp = ncp, na.rm = na.rm, orientation = orientation, ... ) ) } #' @keywords internal #' @noRd GeomJunction <- ggplot2::ggproto("GeomJunction", ggplot2::GeomLine, required_aes = c("xstart", "xend", "y"), setup_data = function(data, params) { # check that junction.orientation is length 1 + one of possible options .check_junction.orientation(params) # check that junction.y.max is length 1 + one of possible options .check_junction.y.max(params) # we need a unique group id per junction, rather than per transcript # similar to spring example from ggplot2 book # https://ggplot2-book.org/spring1.html#spring3 if (is.null(data$group)) { data$group <- seq_len(nrow(data)) } if (anyDuplicated(data$group)) { data$group <- paste(data$group, seq_len(nrow(data)), sep = "-") } # needed to permit usage of xstart/xend transform( data, x = xstart, xstart = NULL ) }, draw_panel = function(data, panel_params, coord, junction.orientation = "alternating", junction.y.max = 1, angle = 90, ncp = 15) { # junction_index represents the order of each junction within tx # needed for junction.orientation = "alternating" data <- data %>% dplyr::group_by(y) %>% dplyr::mutate(junction_index = dplyr::row_number()) %>% dplyr::ungroup() # obtain the actual curves using grid:::calcControlPoints junctions <- .get_junction_curves(data, angle, ncp) # normalise curve points to lie between 0-1 # scale to fit depending on N txs, width of junctions junctions <- .get_normalised_curve( junctions, junction.orientation, junction.y.max ) ggplot2::GeomLine$draw_panel(junctions, panel_params, coord) } ) #' @keywords internal #' @noRd .get_junction_curves <- function(data, angle, ncp) { # very similar to springs example # create the junction points, whilst preserving aes # https://ggplot2-book.org/spring1.html#spring3 # TODO - implementation could probably be vectorised for speed cols_to_keep <- setdiff(names(data), c("x", "xend", "y")) junctions <- lapply(seq_len(nrow(data)), function(i) { junction_curve <- .get_junction_curve( data$x[i], data$xend[i], data$y[i], angle, ncp ) cbind(junction_curve, unclass(data[i, cols_to_keep])) }) junctions <- do.call(rbind, junctions) return(junctions) } #' @keywords internal #' @noRd .get_junction_curve <- function(x, xend, y, angle, ncp) { # creates the points for each curve curve_points <- calcControlPoints( x1 = x, x2 = xend, y1 = y, y2 = y, angle = angle, curvature = -0.5, ncp = ncp ) # need to re-add the original points as these not included # by grid:::calcControlPoints # makes sure junctions curves meet the intron lines junction_curve <- data.frame( x_points = c(x, curve_points$x, xend), y_points = c(y, curve_points$y, y), y_original = y ) %>% dplyr::rename( x = x_points, y = y_points ) return(junction_curve) } #' @keywords internal #' @noRd .get_normalised_curve <- function(junctions, junction.orientation, junction.y.max) { # junction.y.max is equivalent to the max y of each junction curve # each tx is internally uses y an integer # scaling factor (sf) is used normalise the junction curve points sf <- 1 / junction.y.max # each curve point is normalised with relation to the original tx y # first divided by the max(y), meaning all y values lie between 0-1 # then divided by the sf, setting the max y if (junction.orientation == "top") { junctions <- junctions %>% dplyr::mutate( y = ifelse(y == y_original, y, y_original + (y / max(y)) / sf) ) } else if (junction.orientation == "bottom") { junctions <- junctions %>% dplyr::mutate( y = ifelse(y == y_original, y, y_original - (y / max(y)) / sf) ) } else if (junction.orientation == "alternating") { junctions <- junctions %>% dplyr::mutate(y = dplyr::case_when( y == y_original ~ y, junction_index %% 2 == 0 ~ y_original - (y / max(y) / sf), junction_index %% 2 == 1 ~ y_original + (y / max(y) / sf) )) } return(junctions) } #' @keywords internal #' @noRd .check_junction.orientation <- function(params) { not_orient_option <- !(params$junction.orientation %in% c("alternating", "top", "bottom")) if (not_orient_option) { stop( "junction.orientation must be one of ", "'alternating', 'top' or 'bottom'" ) } } #' @keywords internal #' @noRd .check_junction.y.max <- function(params) { if (length(params$junction.y.max) != 1) { stop( "junction.y.max must have a length of 1" ) } if (!is.numeric(params$junction.y.max)) { stop( "junction.y.max must be a numeric value (integer/double)" ) } } calcControlPoints <- grid:::calcControlPoints ================================================ FILE: R/geom_junction_label_repel.R ================================================ #' Label junction curves #' #' `geom_junction_label_repel()` labels junction curves at their midpoint using #' `ggrepel::geom_label_repel()`. This can be useful to label and compare #' junctions (plotted using `geom_junction()`) with metrics of their usage (e.g. #' read counts or percent-spliced-in). #' #' `geom_junction_label_repel()` requires the following `aes()`; `xstart`, #' `xend`, `y` (e.g. transcript name) and `label`. Under the hood, #' `geom_junction_label_repel()` generates the same junction curves as #' `geom_junction()` to obtain curve midpoints for labeling. Therefore, it is #' important that users use the same input data and parameters that alter #' junction curves (namely `junction.orientation`, `junction.y.max`, `angle`, #' `ncp`) for `geom_junction_label_repel()` that they have used for #' `geom_junction()`. #' #' @inheritParams ggrepel::geom_text_repel #' @inheritParams grid::curveGrob #' @inheritParams geom_junction #' #' @return the return value of a `geom_*` function is not intended to be #' directly handled by users. Therefore, `geom_*` functions should never be #' executed in isolation, rather used in combination with a #' `ggplot2::ggplot()` call. #' #' @export #' @examples #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' sod1_annotation %>% head() #' #' # as well as a set of example (unannotated) junctions #' # originating from GTEx and downloaded via the Bioconductor package snapcount #' sod1_junctions #' #' # extract exons #' sod1_exons <- sod1_annotation %>% dplyr::filter( #' type == "exon", #' transcript_name == "SOD1-201" #' ) #' sod1_exons %>% head() #' #' # add transcript_name to junctions for plotting #' sod1_junctions <- sod1_junctions %>% #' dplyr::mutate(transcript_name = "SOD1-201") #' #' # geom_junction_label_repel() can be used to label junctions #' base <- sod1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = to_intron(sod1_exons, "transcript_name") #' ) #' #' # this can be useful to label junctions with their counts #' base + #' geom_junction( #' data = sod1_junctions, #' junction.y.max = 0.5 #' ) + #' geom_junction_label_repel( #' data = sod1_junctions, #' aes(label = round(mean_count, 2)), #' junction.y.max = 0.5 #' ) geom_junction_label_repel <- function(mapping = NULL, data = NULL, stat = "identity", position = "identity", parse = FALSE, ..., junction.orientation = "alternating", junction.y.max = 1, angle = 90, ncp = 15, box.padding = 0.25, label.padding = 0.25, point.padding = 1e-6, label.r = 0.15, label.size = 0.25, min.segment.length = 0, arrow = NULL, force = 1, force_pull = 1, max.time = 0.5, max.iter = 10000, max.overlaps = getOption("ggrepel.max.overlaps", default = 10), nudge_x = 0, nudge_y = 0, xlim = c(NA, NA), ylim = c(NA, NA), na.rm = FALSE, show.legend = NA, direction = c("both", "y", "x"), seed = NA, verbose = FALSE, inherit.aes = TRUE) { if (!missing(nudge_x) || !missing(nudge_y)) { if (!missing(position)) { stop("Specify either `position` or `nudge_x`/`nudge_y`", call. = FALSE) } position <- position_nudge_repel(nudge_x, nudge_y) } ggplot2::layer( data = data, mapping = mapping, stat = stat, geom = GeomJunctionLabelRepel, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( parse = parse, junction.orientation = junction.orientation, junction.y.max = junction.y.max, angle = angle, ncp = ncp, box.padding = to_unit(box.padding), label.padding = to_unit(label.padding), point.padding = to_unit(point.padding), label.r = to_unit(label.r), label.size = label.size, min.segment.length = to_unit(min.segment.length), arrow = arrow, na.rm = na.rm, force = force, force_pull = force_pull, max.time = max.time, max.iter = max.iter, max.overlaps = max.overlaps, nudge_x = nudge_x, nudge_y = nudge_y, xlim = xlim, ylim = ylim, direction = match.arg(direction), seed = seed, verbose = verbose, ... ) ) } #' @include geom_junction.R #' @keywords internal #' @noRd GeomJunctionLabelRepel <- ggplot2::ggproto( "GeomJunctionLabelRepel", ggrepel::GeomLabelRepel, required_aes = c("xstart", "xend", "y", "label"), # copied from ggrepel::GeomLabelRepel with segment.colour and segment.alpha # defaults set to appropriate values, rather than NULL # this avoid warnings e.g. Unknown or uninitialised column: `segment.alpha` # but does cause issues when setting e.g. aes(colour = tx) # TODO - resolve either warning or make segment.colour borrow colour aes default_aes = aes( colour = "black", fill = "white", size = 3.88, angle = 0, alpha = NA, family = "", fontface = 1, lineheight = 1.2, hjust = 0.5, vjust = 0.5, point.size = 1, segment.linetype = 1, segment.colour = "black", segment.size = 0.5, segment.alpha = NA, segment.curvature = 0, segment.angle = 90, segment.ncp = 1, segment.shape = 0.5, segment.square = TRUE, segment.squareShape = 1, segment.inflect = FALSE, segment.debug = FALSE ), setup_data = GeomJunction$setup_data, draw_panel = function(data, panel_scales, coord, parse = FALSE, na.rm = FALSE, junction.orientation = "alternating", junction.y.max = 1, angle = 90, ncp = 15, box.padding = 0.25, label.padding = 0.25, point.padding = 1e-6, label.r = 0.15, label.size = 0.25, min.segment.length = 0, arrow = NULL, force = 1, force_pull = 1, max.time = 0.5, max.iter = 10000, max.overlaps = 10, nudge_x = 0, nudge_y = 0, xlim = c(NA, NA), ylim = c(NA, NA), direction = "both", seed = NA, verbose = FALSE) { # junction_index represents the order of each junction within tx # needed for junction.orientation = "alternating" data <- data %>% dplyr::group_by(y) %>% dplyr::mutate(junction_index = dplyr::row_number()) %>% dplyr::ungroup() # obtain the midpoints of junction curves (where we want label) junction_midpoints <- to_junction_midpoints( data, angle, ncp, junction.orientation, junction.y.max ) ggrepel::GeomLabelRepel$draw_panel( data = junction_midpoints, panel_scales = panel_scales, coord = coord, parse = parse, na.rm = na.rm, box.padding = box.padding, label.padding = label.padding, point.padding = point.padding, label.r = label.r, label.size = label.size, min.segment.length = min.segment.length, arrow = arrow, force = force, force_pull = force_pull, max.time = max.time, max.iter = max.iter, max.overlaps = max.overlaps, nudge_x = nudge_x, nudge_y = nudge_y, xlim = xlim, ylim = ylim, direction = direction, seed = seed, verbose = verbose ) } ) #' Wrapper for obtaining junction curve midpoints #' #' @keywords internal #' @noRd to_junction_midpoints <- function(data, angle, ncp, junction.orientation, junction.y.max) { # TODO - maybe export this as helper? junctions <- .get_junction_curves(data, angle, ncp) junctions <- .get_normalised_curve( junctions, junction.orientation, junction.y.max ) junction_midpoints <- .get_curve_midpoints(junctions) return(junction_midpoints) } #' @keywords internal #' @noRd .get_curve_midpoints <- function(junctions) { # get the mid points of each curve for labeling junctions # these are the points with the x value closest to median(x) # this cannot be == median(x), this will not pick up point for even ncp's junctions_mid <- junctions %>% dplyr::group_by(group) %>% dplyr::mutate( median_x = stats::median(x), median_diff = abs(x - median_x) ) %>% dplyr::filter(median_diff == min(median_diff)) %>% dplyr::ungroup() %>% dplyr::select(-median_x, -median_diff) return(junctions_mid) } to_unit <- ggrepel:::to_unit ================================================ FILE: R/geom_range.R ================================================ #' Plot genomic ranges #' #' `geom_range()` and `geom_half_range()` draw tiles that are designed to #' represent range-based genomic features, such as exons. In combination with #' `geom_intron()`, these geoms form the core components for visualizing #' transcript structures. #' #' `geom_range()` and `geom_half_range()` require the following `aes()`; #' `xstart`, `xend` and `y` (e.g. transcript name). `geom_half_range()` takes #' advantage of the vertical symmetry of transcript annotation by plotting only #' half of a range on the top or bottom of a transcript structure. This can be #' useful for comparing between two transcripts or free up plotting space for #' other transcript annotations (e.g. `geom_junction()`). #' #' @inheritParams ggplot2::layer #' @inheritParams ggplot2::geom_point #' @inheritParams ggplot2::geom_tile #' @inheritParams ggplot2::geom_segment #' @inheritParams grid::rectGrob #' #' @return the return value of a `geom_*` function is not intended to be #' directly handled by users. Therefore, `geom_*` functions should never be #' executed in isolation, rather used in combination with a #' `ggplot2::ggplot()` call. #' #' @export #' @examples #' #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' sod1_annotation %>% head() #' #' # extract exons #' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") #' sod1_exons %>% head() #' #' base <- sod1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) #' #' # geom_range() is designed to visualise range-based annotation such as exons #' base + geom_range() #' #' # geom_half_range() allows users to plot half ranges #' # on the top or bottom of the transcript #' base + geom_half_range() #' #' # where the half ranges are plotted can be adjusted using range.orientation #' base + geom_half_range(range.orientation = "top") #' #' # as a ggplot2 extension, ggtranscript geoms inherit the #' # the functionality from the parameters and aesthetics in ggplot2 #' base + geom_range( #' aes(fill = transcript_name), #' linewidth = 1 #' ) #' #' # together, geom_range() and geom_intron() are designed to visualize #' # the core components of transcript annotation #' base + geom_range( #' aes(fill = transcript_biotype) #' ) + #' geom_intron( #' data = to_intron(sod1_exons, "transcript_name") #' ) #' #' # for protein coding transcripts #' # geom_range() be useful for visualizing UTRs that lie outside of the CDS #' sod1_exons_prot_coding <- sod1_exons %>% #' dplyr::filter(transcript_biotype == "protein_coding") #' #' # extract cds #' sod1_cds <- sod1_annotation %>% #' dplyr::filter(type == "CDS") #' #' sod1_exons_prot_coding %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range( #' fill = "white", #' height = 0.25 #' ) + #' geom_range( #' data = sod1_cds #' ) + #' geom_intron( #' data = to_intron(sod1_exons_prot_coding, "transcript_name") #' ) #' #' # geom_half_range() can be useful for comparing between two transcripts #' # enabling visualization of one transcript on the top, other on the bottom #' sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") #' sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201") #' sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202") #' sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202") #' #' sod1_201_plot <- sod1_201_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = "SOD1-201/202" #' )) + #' geom_half_range( #' fill = "white", #' height = 0.125 #' ) + #' geom_half_range( #' data = sod1_201_cds #' ) + #' geom_intron( #' data = to_intron(sod1_201_exons, "transcript_name") #' ) #' #' sod1_201_plot #' #' sod1_201_202_plot <- sod1_201_plot + #' geom_half_range( #' data = sod1_202_exons, #' range.orientation = "top", #' fill = "white", #' height = 0.125 #' ) + #' geom_half_range( #' data = sod1_202_cds, #' range.orientation = "top", #' fill = "purple" #' ) + #' geom_intron( #' data = to_intron(sod1_202_exons, "transcript_name") #' ) #' #' sod1_201_202_plot #' #' # leveraging existing ggplot2 functionality via e.g. coord_cartesian() #' # can be useful to zoom in on areas of interest #' sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000)) geom_range <- function(mapping = NULL, data = NULL, stat = "identity", position = "identity", ..., vjust = NULL, linejoin = "mitre", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) { ggplot2::layer( data = data, mapping = mapping, stat = stat, geom = GeomRange, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( vjust = vjust, linejoin = linejoin, na.rm = na.rm, ... ) ) } #' `GeomRange` is `ggplot2::GeomTile` with modified `aes` to match genetic #' nomenclature (`xstart`/`xend`) #' @keywords internal #' @noRd GeomRange <- ggplot2::ggproto("GeomRange", ggplot2::GeomTile, required_aes = c("xstart", "xend", "y"), default_aes = aes( fill = "grey", colour = "black", linewidth = 0.25, linetype = 1, alpha = NA, height = NA ), setup_data = function(data, params) { # modified from ggplot2::GeomTile data$height <- data$height %||% params$height %||% 0.5 transform( data, xmin = xstart, xmax = xend, ymin = y - height / 2, ymax = y + height / 2, height = NULL ) }, draw_panel = function(self, data, panel_params, coord, vjust = NULL, lineend = "butt", linejoin = "mitre") { if (!coord$is_linear()) { # prefer to match geom_curve and warn # rather than copy the implementation from GeomRect for simplicity # also don'think geom_range would be used for non-linear coords warn("geom_ is not implemented for non-linear coordinates") } coords <- coord$transform(data, panel_params) grid::rectGrob( coords$xmin, coords$ymax, width = coords$xmax - coords$xmin, height = coords$ymax - coords$ymin, default.units = "native", just = c("left", "top"), vjust = vjust, gp = grid::gpar( col = coords$colour, fill = ggplot2::alpha(coords$fill, coords$alpha), lwd = coords$linewidth * ggplot2::.pt, lty = coords$linetype, linejoin = linejoin, lineend = lineend ) ) } ) ================================================ FILE: R/ggtranscript-package.R ================================================ #' `ggtranscript`: Visualizing transcript structure and annotation using #' `ggplot2` #' #' The goal of `ggtranscript` is the simplify the process of visualizing #' transcript structure and annotation. To achieve this, `ggtranscript` #' introduces 5 new geoms (`geom_range()`, `geom_half_range()`, `geom_intron()`, #' `geom_junction()` and `geom_junction_label_repel()`) as well as several #' helper functions. As a `ggplot2` extension, `ggtranscript` inherits #' `ggplot2`'s familiarity and flexibility, enabling users to intuitively adjust #' aesthetics, parameters, scales etc as well as complement `ggtranscript` geoms #' with existing `ggplot2` geoms to create informative, publication-ready plots. #' #' @docType package #' @name ggtranscript "_PACKAGE" #' @importFrom rlang %||% #' @importFrom magrittr %>% #' @importFrom ggplot2 aes #' @import ggrepel NULL ================================================ FILE: R/globals.R ================================================ # bypass R CMD Check notes, related to tidyverse non-standard evaluation # https://www.r-bloggers.com/2019/08/no-visible-binding-for-global-variable/ utils::globalVariables(c( "x", "start", "end", ":=", "intron_start", "intron_end", "exons", "xend", "mid", "index", "diff_type", "in_exons", "in_ref_exons", ".", # shorten_gaps() "width", "rescaled_start", "rescaled_end", "width_tx_start", "seqnames", "strand", "shorten_type", "gap_width", "shortened_gap_width", "shortened_gap_diff", "sum_shortened_gap_diff", "intron_indexes", "shortened_width", "type", # add_exon_number() "exon_number", "n_exons", # geom_junction_label_repel() "group", "median", "median_x", "median_diff", "x_points", "y_points", "y", "y_original", "position_nudge_repel", # add_utr(), "dummy_group" )) ================================================ FILE: R/shorten_gaps.R ================================================ #' Improve transcript structure visualization by shortening gaps #' #' For a given set of exons and introns, `shorten_gaps()` reduces the width of #' gaps (regions that do not overlap any `exons`) to a user-inputted #' `target_gap_width`. This can be useful when visualizing transcripts that have #' long introns, to hone in on the regions of interest (i.e. exons) and better #' compare between transcript structures. #' #' After `shorten_gaps()` reduces the size of gaps, it will re-scale `exons` and #' `introns` to preserve exon alignment. This process will only reduce the width #' of input `introns`, never `exons`. Importantly, the outputted re-scaled #' co-ordinates should only be used for visualization as they will not match the #' original genomic coordinates. #' #' @inheritParams to_diff #' @param introns `data.frame()` the intron co-ordinates corresponding to the #' input `exons`. This can be created by applying `to_intron()` to the #' `exons`. If introns originate from multiple transcripts, they must be #' differentiated using `group_var`. If a user is not using `to_intron()`, #' they must make sure intron start/ends are defined precisely as the adjacent #' exon boundaries (rather than exon end + 1 and exon start - 1). #' @param target_gap_width `integer()` the width in base pairs to shorten the #' gaps to. #' #' @return `data.frame()` contains the re-scaled co-ordinates of `introns` and #' `exons` of each input transcript with shortened gaps. #' #' @export #' @examples #' #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' pknox1_annotation %>% head() #' #' # extract exons #' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") #' pknox1_exons %>% head() #' #' # to_intron() is a helper function included in ggtranscript #' # which is useful for converting exon co-ordinates to introns #' pknox1_introns <- pknox1_exons %>% to_intron(group_var = "transcript_name") #' pknox1_introns %>% head() #' #' # for transcripts with long introns, the exons of interest #' # can be difficult to visualize clearly when using the default scale #' pknox1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = pknox1_introns, #' arrow.min.intron.length = 3500 #' ) #' #' # in such cases it can be useful to rescale the exons and introns #' # using shorten_gaps() which shortens regions that do not overlap an exon #' pknox1_rescaled <- #' shorten_gaps(pknox1_exons, pknox1_introns, group_var = "transcript_name") #' #' pknox1_rescaled %>% head() #' #' # this allows us to visualize differences in exonic structure more clearly #' pknox1_rescaled %>% #' dplyr::filter(type == "exon") %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = pknox1_rescaled %>% dplyr::filter(type == "intron"), #' arrow.min.intron.length = 300 #' ) #' #' # shorten_gaps() can be used in combination with to_diff() #' # to further highlight differences in exon structure #' # here, all other transcripts are compared to the MANE-select transcript #' pknox1_rescaled_diffs <- to_diff( #' exons = pknox1_rescaled %>% #' dplyr::filter(type == "exon", transcript_name != "PKNOX1-201"), #' ref_exons = pknox1_rescaled %>% #' dplyr::filter(type == "exon", transcript_name == "PKNOX1-201"), #' group_var = "transcript_name" #' ) #' #' pknox1_rescaled %>% #' dplyr::filter(type == "exon") %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = pknox1_rescaled %>% dplyr::filter(type == "intron"), #' arrow.min.intron.length = 300 #' ) + #' geom_range( #' data = pknox1_rescaled_diffs, #' aes(fill = diff_type), #' alpha = 0.2 #' ) shorten_gaps <- function(exons, introns, group_var = NULL, target_gap_width = 100L) { # input checks .check_coord_object(exons, check_seqnames = TRUE, check_strand = TRUE) .check_coord_object(introns, check_seqnames = TRUE, check_strand = TRUE) .check_group_var(exons, group_var) .check_group_var(introns, group_var) target_gap_width <- .check_target_gap_width(target_gap_width) # check type column, create if not present exons <- .get_type(exons, "exons") introns <- .get_type(introns, "introns") # to_intron() defines introns using the exon boundaries # we need to convert this to the actual gap definition to make sure # comparison to GenomicRanges::gaps() when using "equal" works correctly # this is converted back in .get_rescaled_txs() introns <- introns %>% dplyr::mutate( start = start + 1, end = end - 1 ) # we use GenomicRanges methods for downstream processing exons_gr <- GenomicRanges::GRanges(exons) introns_gr <- GenomicRanges::GRanges(introns) # obtain actual gaps, i.e. regions that overlap no exons intron_gaps <- .get_gaps(exons_gr) # by mapping gaps back to introns, we can then shorten overlapping gaps gap_map_intron <- .get_gap_map(introns_gr, intron_gaps) introns_shortened <- .get_shortened_gaps( introns, intron_gaps, gap_map_intron, group_var, target_gap_width ) # don't have to take tx_start_gaps into account if only 1 tx if (!is.null(group_var)) { # because we're shortening intron_gaps, we also need to shorten the # region from start of the plot and start of each tx (tx_start_gaps) tx_start_gaps <- .get_tx_start_gaps(exons, group_var) gap_map_tx_start_gaps <- .get_gap_map( tx_start_gaps %>% GenomicRanges::GRanges(), intron_gaps ) tx_start_gaps_shortened <- .get_shortened_gaps( tx_start_gaps, intron_gaps, gap_map_tx_start_gaps, group_var, target_gap_width ) %>% dplyr::select(-start, -end, -strand, -seqnames, -strand) } rescaled_tx <- .get_rescaled_txs( exons, introns_shortened, tx_start_gaps_shortened, group_var ) return(rescaled_tx) } #' Add a type column if it is not present already #' #' @keywords internal #' @noRd .get_type <- function(x, exons_introns) { if (!is.null(x[["type"]])) { # if there is an existing type column for introns # need to make sure this is all "intron" for downstream functions # don't check for exons, as this can be variable (e.g. "five_prime_utr") if (exons_introns == "introns") { allowed_types <- "intron" if (!all(x[["type"]] %in% allowed_types)) { stop( "values in the 'type' column of ", exons_introns, " must be one of: ", allowed_types %>% paste0("'", ., "'") %>% paste(collapse = ", ") ) } } } else { # if there isn't, we add a default type column default_type <- ifelse(exons_introns == "exons", "exon", "intron") x <- x %>% dplyr::mutate(type = default_type) } return(x) } #' @keywords internal #' @noRd .get_gaps <- function(exons_gr) { orig_seqnames <- exons_gr %>% GenomicRanges::seqnames() %>% as.character() %>% unique() orig_strand <- exons_gr %>% GenomicRanges::strand() %>% as.character() %>% unique() # make sure we only have exons from a single transcript .check_len_1_strand_seqnames(orig_seqnames, orig_strand) # "reduce" exons - here meaning to collapse into single meta transcript exons_gr_reduced <- exons_gr %>% GenomicRanges::reduce() # keep only the relevant seqnames, otherwise gaps includes all seqlevels GenomeInfoDb::seqlevels(exons_gr_reduced, pruning.mode = "coarse") <- orig_seqnames # obtain intronic gaps of the meta transcript intron_gaps <- exons_gr_reduced %>% GenomicRanges::gaps( start = min(GenomicRanges::start(exons_gr_reduced)), end = max(GenomicRanges::end(exons_gr_reduced)) ) # gaps creates a gap per strand too, keep only those from the original strand intron_gaps <- intron_gaps %>% .[GenomicRanges::strand(intron_gaps) == orig_strand] return(intron_gaps) } #' @keywords internal #' @noRd .get_tx_start_gaps <- function(exons, group_var) { # need to scale the transcript starts so scaled introns/exons align # importantly, this tx start also has to take into account # whether intron_gaps that overlap it have been shortened # here, get the tx_start_gap - the region between # 1. the start of plot (smallest start position of all txs) # 2. the start of each tx tx_start_gaps <- exons %>% dplyr::group_by_at(.vars = c( group_var )) %>% dplyr::summarise( seqnames = unique(seqnames), strand = unique(strand), end = min(start), # min start of this transcript start = min(exons[["start"]]) # min start of all transcripts ) return(tx_start_gaps) } #' map the gaps back to introns/transcript start gaps #' @keywords internal #' @noRd .get_gap_map <- function(y, intron_gaps) { # when we reduce the length of the intron_gaps, whilst making sure # whilst making sure the exons/introns remain aligned # to do this, we need to map the intron_gaps back onto the introns # the simplest case is when gaps are identical to original introns equal_hits <- GenomicRanges::findOverlaps( intron_gaps, y, type = "equal" ) # often, the intron_gaps don't map identically # this occurs due to the exons of one tx overlapping the intron of another # we find cases when the gaps are completely contained an original intron # using type = "within", but this also catches the "equal" intron_gaps within_hits <- GenomicRanges::findOverlaps( intron_gaps, y, type = "within" ) # convert to data.frame() to use dplyr::anti_join() equal_hits <- equal_hits %>% as.data.frame() within_hits <- within_hits %>% as.data.frame() # remove the "equal" hits from the "within" pure_within_hits <- within_hits %>% dplyr::anti_join(equal_hits, by = c("queryHits", "subjectHits")) # need both "equal" and "pure_within" hits gap_map <- list( equal = equal_hits, pure_within = pure_within_hits ) return(gap_map) } #' @keywords internal #' @noRd .get_shortened_gaps <- function(y, intron_gaps, gap_map, group_var, target_gap_width) { # we need the intron/tx_start_gap widths (to shorten them) y <- y %>% dplyr::mutate(width = (end - start) + 1) # characterise introns by shortening type y_shortened <- y %>% dplyr::mutate( shorten_type = dplyr::case_when( dplyr::row_number() %in% gap_map[["equal"]][["subjectHits"]] ~ "equal", dplyr::row_number() %in% gap_map[["pure_within"]][["subjectHits"]] ~ "pure_within", TRUE ~ "none" ) ) # for the "equal" cases, simply shorten the widths to the target_gap_width y_shortened <- y_shortened %>% dplyr::mutate( shortened_width = ifelse( (shorten_type == "equal") & (width > target_gap_width), target_gap_width, width ) ) # for the "within" cases we need to shorten the intron widths # by the !total! amount the overlapping gaps are shortened overlapping_gap_indexes <- gap_map[["pure_within"]][["queryHits"]] # only have to this if there are gaps that are "pure_within" if (length(overlapping_gap_indexes) > 0) { # one intron may overlap multiple gaps # first, calculate the sum of the reduction in gap widths sum_gap_diff <- dplyr::tibble( intron_indexes = gap_map[["pure_within"]][["subjectHits"]], gap_width = GenomicRanges::width(intron_gaps)[overlapping_gap_indexes] ) %>% dplyr::mutate( shortened_gap_width = ifelse( gap_width > target_gap_width, target_gap_width, gap_width ), shortened_gap_diff = gap_width - shortened_gap_width, ) %>% dplyr::group_by(intron_indexes) %>% dplyr::summarise( sum_shortened_gap_diff = sum(shortened_gap_diff) ) # now actually do reduction for introns with "pure_within" gaps y_shortened[["sum_shortened_gap_diff"]] <- NA_integer_ y_shortened[["sum_shortened_gap_diff"]][sum_gap_diff[["intron_indexes"]]] <- sum_gap_diff[["sum_shortened_gap_diff"]] y_shortened <- y_shortened %>% dplyr::mutate( shortened_width = ifelse( is.na(sum_shortened_gap_diff), shortened_width, width - sum_shortened_gap_diff ) ) %>% dplyr::select(-sum_shortened_gap_diff) } # remove unecessary intermediate cols y_shortened <- y_shortened %>% dplyr::select( -shorten_type, -width, width = shortened_width ) return(y_shortened) } #' @keywords internal #' @noRd .get_rescaled_txs <- function(exons, introns_shortened, tx_start_gaps_shortened, group_var) { # calculate the rescaled exon/intron start/ends using # the widths of the exons and reduced introns rescaled_tx <- exons %>% dplyr::mutate( width = (end - start) + 1 ) # bind together exons and introns and arrange into genomic order rescaled_tx <- rescaled_tx %>% dplyr::bind_rows( introns_shortened ) %>% dplyr::arrange_at(.vars = c(group_var, "start", "end")) # calculate the rescaled coords using cumsum of the widths of introns/exons rescaled_tx <- rescaled_tx %>% dplyr::group_by_at(.vars = c( group_var )) %>% dplyr::mutate( rescaled_end = cumsum(width), rescaled_start = rescaled_end - (width - 1) ) %>% dplyr::ungroup() # account for the tx starts being in different places # to keep everything aligned if (is.null(group_var)) { # if only 1 tx, we use 1 as the dummy rescaled tx_start rescaled_tx <- rescaled_tx %>% dplyr::mutate(width_tx_start = 1) } else { rescaled_tx <- rescaled_tx %>% dplyr::left_join( tx_start_gaps_shortened, by = c(group_var), suffix = c("", "_tx_start") ) } rescaled_tx <- rescaled_tx %>% dplyr::mutate( rescaled_end = rescaled_end + width_tx_start, rescaled_start = rescaled_start + width_tx_start ) %>% dplyr::select(-dplyr::contains("width")) # convert introns back to be defined by exon boundaries, match to_intron() rescaled_tx <- rescaled_tx %>% dplyr::mutate( start = ifelse(type == "intron", start - 1, start), end = ifelse(type == "intron", end + 1, end), rescaled_start = ifelse( type == "intron", rescaled_start - 1, rescaled_start ), rescaled_end = ifelse( type == "intron", rescaled_end + 1, rescaled_end ) ) # remove original start/end rescaled_tx <- rescaled_tx %>% dplyr::select(-start, -end) rescaled_tx <- rescaled_tx %>% dplyr::select( seqnames, start = rescaled_start, end = rescaled_end, strand, dplyr::everything() ) return(rescaled_tx) } #' we expect the exons to originate from a single gene. #' therefore, unique strand and seqnames should be of length 1 #' @keywords internal #' @noRd .check_len_1_strand_seqnames <- function(orig_seqnames, orig_strand) { ab_1_uniq <- "of object contains more than 1 unique value. " reason <- "object is expected to contain exons from a single gene." if (length(orig_seqnames) != 1) { stop("seqnames ", ab_1_uniq, reason) } if (length(orig_strand) != 1) { stop("strand ", ab_1_uniq, reason) } } #' @keywords internal #' @noRd .check_target_gap_width <- function(target_gap_width) { if (!is.integer(target_gap_width)) { warning("target_gap_width must be an integer, coercing...") target_gap_width <- target_gap_width %>% as.integer() } return(target_gap_width) } ================================================ FILE: R/to_diff.R ================================================ #' Obtain the differences between transcript structure #' #' `to_diff()` obtains the difference between `exons` from a set of transcripts #' to a reference transcript (`ref_exons`). This can be useful when visualizing #' the differences between transcript structure. `to_diff()` expects two sets of #' input exons; 1. `exons` - exons from any number of transcripts that will be #' compared to `ref_exons` and 2. `ref_exons` - exons from a single transcript #' which acts as the reference to compare against. #' #' @param exons `data.frame()` contains exons which can originate from multiple #' transcripts differentiated by `group_var`. #' @param ref_exons `data.frame()` contains exons that originate from a single #' transcript, which `exons` will be compared against. #' @param group_var `character()` if input data originates from more than 1 #' transcript, `group_var` must specify the column that differentiates #' transcripts (e.g. "transcript_id"). #' #' @return `data.frame()` details the differences between `exons` and #' `ref_exons`. #' #' @export #' @examples #' #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' sod1_annotation %>% head() #' #' # extract exons #' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") #' sod1_exons %>% head() #' #' # for this example, let's compare transcripts to the MANE-select transcript #' sod1_mane <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") #' sod1_not_mane <- sod1_exons %>% dplyr::filter(transcript_name != "SOD1-201") #' #' # to_diff() obtains the differences between the exons as ranges #' sod1_diffs <- to_diff( #' exons = sod1_not_mane, #' ref_exons = sod1_mane, #' group_var = "transcript_name" #' ) #' #' sod1_diffs %>% head() #' #' # using geom_range(), it can be useful to visually overlay #' # the differences on top of the transcript annotation #' sod1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = to_intron(sod1_exons, "transcript_name") #' ) + #' geom_range( #' data = sod1_diffs, #' ggplot2::aes(fill = diff_type), #' alpha = 0.2 #' ) to_diff <- function(exons, ref_exons, group_var = NULL) { .check_coord_object(exons, check_seqnames = TRUE, check_strand = TRUE) .check_coord_object(ref_exons, check_seqnames = TRUE, check_strand = TRUE) .check_group_var(exons, group_var) # need to remember if group is NULL for downstream null_group <- is.null(group_var) # we have to create dummy group if there is no group for .get_diff if (null_group) { exons <- exons %>% dplyr::mutate(dummy_group = "A") group_var <- "dummy_group" } diffs <- .get_diff(exons, ref_exons, group_var) # remove the dummy_group if created if (null_group) diffs[[group_var]] <- NULL return(diffs) } #' The heavy lifting of `to_diff()` happens here. #' #' @keywords internal #' @noRd .get_diff <- function(exons, ref_exons, group_var) { groups <- exons[[group_var]] %>% unique() # needs to be a genomic range for downstream processing exons_gr <- GenomicRanges::GRanges(exons) ref_exons_gr <- GenomicRanges::GRanges(ref_exons) diffs <- vector("list", length = length(group_var)) for (i in seq_along(groups)) { exons_gr_curr <- exons_gr %>% .[GenomicRanges::mcols(exons_gr)[[group_var]] == groups[i]] # get the disjoint pieces (flattening and breaking apart exons) disjoint_pieces <- GenomicRanges::disjoin( c(ref_exons_gr, exons_gr_curr) ) # find whether the disjoint pieces overlap exons or ref_exons # those that only overlap 1 are the differences # TODO - perhaps allow modification of findOverlaps() via ... ? overlap_exons <- GenomicRanges::findOverlaps( disjoint_pieces, exons_gr_curr ) overlap_ref_exons <- GenomicRanges::findOverlaps( disjoint_pieces, ref_exons_gr ) # convert pieces back to data.frame and classify diffs # TODO - could improve efficiency by placing this step post-loop # i.e. manipulate the grs instead diff_curr <- disjoint_pieces %>% as.data.frame() %>% dplyr::mutate( index = dplyr::row_number(), type = "diff", in_exons = index %in% S4Vectors::queryHits(overlap_exons), in_ref_exons = index %in% S4Vectors::queryHits(overlap_ref_exons) ) %>% dplyr::mutate( diff_type = dplyr::case_when( in_exons & in_ref_exons ~ "both", in_exons & !in_ref_exons ~ "not_in_ref", !in_exons & in_ref_exons ~ "in_ref" ) ) # add back in group info diff_curr[[group_var]] <- groups[i] # keep only diffs and necessary cols diffs[[i]] <- diff_curr %>% dplyr::filter(diff_type != "both") %>% dplyr::select(-in_exons, -in_ref_exons, -index) } diffs <- diffs %>% do.call(dplyr::bind_rows, .) return(diffs) } ================================================ FILE: R/to_intron.R ================================================ #' Convert exon co-ordinates to introns #' #' Given a set of `exons`, `to_intron()` will return the corresponding introns. #' #' It is important to note that, for visualization purposes, `to_intron()` #' defines introns precisely as the exon boundaries, rather than the intron #' start/end being (exon end + 1)/(exon start - 1). #' #' @inheritParams to_diff #' #' @return `data.frame()` contains the intron co-ordinates. #' #' @export #' @examples #' library(magrittr) #' library(ggplot2) #' #' # to illustrate the package's functionality #' # ggtranscript includes example transcript annotation #' sod1_annotation %>% head() #' #' # extract exons #' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") #' sod1_exons %>% head() #' #' # to_intron() is a helper function included in ggtranscript #' # which is useful for converting exon co-ordinates to introns #' sod1_introns <- sod1_exons %>% to_intron(group_var = "transcript_name") #' sod1_introns %>% head() #' #' # this can be particular useful when combined with #' # geom_range() and geom_intron() #' # to visualize the core components of transcript annotation #' sod1_exons %>% #' ggplot(aes( #' xstart = start, #' xend = end, #' y = transcript_name #' )) + #' geom_range() + #' geom_intron( #' data = to_intron(sod1_exons, "transcript_name") #' ) to_intron <- function(exons, group_var = NULL) { .check_coord_object(exons) .check_group_var(exons, group_var) # TODO - switch this to using GenomicRanges::gaps()? if (!is.null(group_var)) { exons <- exons %>% dplyr::group_by_at(.vars = group_var) } # make sure exons are arranged by coord, so that dplyr::lag works correctly exons <- exons %>% dplyr::arrange(start, end) # obtain intron start and ends introns <- exons %>% dplyr::mutate( intron_start := dplyr::lag(end), intron_end := start, type = "intron" ) %>% dplyr::select(-start, -end) # remove the introduced artifact NAs introns <- introns %>% dplyr::ungroup() %>% dplyr::filter(!is.na(intron_start) & !is.na(intron_end)) # filter out introns with a width of 1, this should only happen when # utrs are included and are directly adjacent to end of cds introns <- introns %>% dplyr::filter(abs(intron_end - intron_start) != 1) introns <- introns %>% dplyr::rename(start = intron_start, end = intron_end) return(introns) } ================================================ FILE: R/utils.R ================================================ #' @keywords internal #' @noRd .check_coord_object <- function(x, check_seqnames = FALSE, check_strand = FALSE) { if (!is.data.frame(x)) { stop( "object must be a data.frame. ", "GRanges objects are currently not supported and must be converted ", "using e.g. as.data.frame()" ) } if (!all(c("start", "end") %in% colnames(x))) { stop("object must have the columns 'start' and 'end'") } if (check_seqnames) { if (!("seqnames" %in% colnames(x))) { stop("object must have the column 'seqnames'") } } if (check_strand) { if (!("strand" %in% colnames(x))) { stop("object must have the column 'strand'") } } } #' @keywords internal #' @noRd .check_group_var <- function(x, group_var) { if (!is.null(group_var)) { if (!all(group_var %in% colnames(x))) { stop( "group_var ('", group_var, "') ", "must be a column in object" ) } } } ================================================ FILE: README.Rmd ================================================ --- output: github_document --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%", dpi = 300 ) ``` # ggtranscript [![GitHub issues](https://img.shields.io/github/issues/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/issues) [![GitHub pulls](https://img.shields.io/github/issues-pr/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/pulls) [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) [![R-CMD-check-bioc](https://github.com/dzhang32/ggtranscript/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/dzhang32/ggtranscript/actions) [![Codecov test coverage](https://codecov.io/gh/dzhang32/ggtranscript/branch/main/graph/badge.svg)](https://app.codecov.io/gh/dzhang32/ggtranscript?branch=main) `ggtranscript` is a `ggplot2` extension that makes it to easy to visualize transcript structure and annotation. ## Installation ```{r "install_dev", eval = FALSE} # you can install the development version of ggtranscript from GitHub: # install.packages("devtools") devtools::install_github("dzhang32/ggtranscript") ``` ## Usage `ggtranscript` introduces 5 new geoms (`geom_range()`, `geom_half_range()`, `geom_intron()`, `geom_junction()` and `geom_junction_label_repel()`) and several helper functions designed to facilitate the visualization of transcript structure and annotation. The following guide takes you on a quick tour of using these geoms, for a more detailed overview see the [Getting Started tutorial](https://dzhang32.github.io/ggtranscript/articles/ggtranscript.html). `geom_range()` and `geom_intron()` enable the plotting of exons and introns, the core components of transcript annotation. `ggtranscript` also provides `to_intron()`, which converts exon co-ordinates to the corresponding introns. Together, `ggtranscript` enables users to plot transcript structures with only exons as the required input and just a few lines of code. ```{r geom-range-intron} library(magrittr) library(dplyr) library(ggplot2) library(ggtranscript) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation %>% head() # extract exons sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") sod1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(sod1_exons, "transcript_name"), aes(strand = strand) ) ``` `ggtranscript` provides the helper function `shorten_gaps()`, which reduces the size of the gaps. `shorten_gaps()` then rescales the exon and intron co-ordinates to preserve the original exon alignment. This allows you to hone in the differences in the exonic structure, which can be particularly useful if the transcript has relatively long introns. ```{r shorten-gaps} sod1_rescaled <- shorten_gaps( sod1_exons, to_intron(sod1_exons, "transcript_name"), group_var = "transcript_name" ) sod1_rescaled %>% dplyr::filter(type == "exon") %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = sod1_rescaled %>% dplyr::filter(type == "intron"), arrow.min.intron.length = 200 ) ``` `geom_range()` can be used for any range-based genomic annotation. For example, when plotting protein-coding transcripts, users may find it helpful to visually distinguish the coding segments from UTRs. ```{r geom-range-intron-w-cds} # filter for only exons from protein coding transcripts sod1_exons_prot_cod <- sod1_exons %>% dplyr::filter(transcript_biotype == "protein_coding") # obtain cds sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS") sod1_exons_prot_cod %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_cds ) + geom_intron( data = to_intron(sod1_exons_prot_cod, "transcript_name"), aes(strand = strand), arrow.min.intron.length = 500, ) ``` `geom_half_range()` takes advantage of the vertical symmetry of transcript annotation by plotting only half of a range on the top or bottom of a transcript structure. One use case of `geom_half_range()` is to visualize the differences between transcript structure more clearly. ```{r geom-half-range, fig.height = 3} # extract exons and cds for the two transcripts to be compared sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201") sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202") sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202") sod1_201_202_plot <- sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = "SOD1-201/202" )) + geom_half_range( fill = "white", height = 0.125 ) + geom_half_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_half_range( data = sod1_202_exons, range.orientation = "top", fill = "white", height = 0.125 ) + geom_half_range( data = sod1_202_cds, range.orientation = "top", fill = "purple" ) + geom_intron( data = to_intron(sod1_202_exons, "transcript_name") ) sod1_201_202_plot ``` As a `ggplot2` extension, `ggtranscript` inherits the the familiarity and functionality of `ggplot2`. For instance, by leveraging `coord_cartesian()` users can zoom in on regions of interest. ```{r geom-half-range-zoomed, fig.height = 3} sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000)) ``` `geom_junction()` enables to plotting of junction curves, which can be overlaid across transcript structures. `geom_junction_label_repel()` adds a label to junction curves, which can often be useful to mark junctions with a metric of their usage such as read counts. ```{r geom-junction, fig.height = 3} # ggtranscript includes a set of example (unannotated) junctions # originating from GTEx and downloaded via the Bioconductor package snapcount sod1_junctions # add transcript_name to junctions for plotting sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201") sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, junction.y.max = 0.5 ) + geom_junction_label_repel( data = sod1_junctions, aes(label = round(mean_count, 2)), junction.y.max = 0.5 ) ``` Alternatively, users may prefer to map junction read counts to the thickness of the junction curves. As a `ggplot2` extension, this can be done intuitively by modifying the size `aes()` of `geom_junction()`. In addition, by modifying `ggplot2` scales and themes, users can easily create informative, publication-ready plots. ```{r geom-junction-pub, fig.height = 3} sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, aes(size = mean_count), junction.y.max = 0.5, ncp = 30, colour = "purple" ) + scale_size_continuous(range = c(0.1, 1), guide = "none") + xlab("Genomic position (chr21)") + ylab("Transcript name") + theme_bw() ``` ## Citation ```{r citing-ggtranscript} citation("ggtranscript") ``` ## Credits * `ggtranscript` was developed using `biocthis`. ================================================ FILE: README.md ================================================ # ggtranscript [![GitHub issues](https://img.shields.io/github/issues/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/issues) [![GitHub pulls](https://img.shields.io/github/issues-pr/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/pulls) [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) [![R-CMD-check-bioc](https://github.com/dzhang32/ggtranscript/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/dzhang32/ggtranscript/actions) [![Codecov test coverage](https://codecov.io/gh/dzhang32/ggtranscript/branch/main/graph/badge.svg)](https://app.codecov.io/gh/dzhang32/ggtranscript?branch=main) `ggtranscript` is a `ggplot2` extension that makes it to easy to visualize transcript structure and annotation. ## Installation ``` r # you can install the development version of ggtranscript from GitHub: # install.packages("devtools") devtools::install_github("dzhang32/ggtranscript") ``` ## Usage `ggtranscript` introduces 5 new geoms (`geom_range()`, `geom_half_range()`, `geom_intron()`, `geom_junction()` and `geom_junction_label_repel()`) and several helper functions designed to facilitate the visualization of transcript structure and annotation. The following guide takes you on a quick tour of using these geoms, for a more detailed overview see the [Getting Started tutorial](https://dzhang32.github.io/ggtranscript/articles/ggtranscript.html). `geom_range()` and `geom_intron()` enable the plotting of exons and introns, the core components of transcript annotation. `ggtranscript` also provides `to_intron()`, which converts exon co-ordinates to the corresponding introns. Together, `ggtranscript` enables users to plot transcript structures with only exons as the required input and just a few lines of code. ``` r library(magrittr) library(dplyr) #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union library(ggplot2) library(ggtranscript) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation %>% head() #> # A tibble: 6 × 8 #> seqnames start end strand type gene_name transcript_name #> #> 1 21 31659666 31668931 + gene SOD1 #> 2 21 31659666 31668931 + transcript SOD1 SOD1-202 #> 3 21 31659666 31659784 + exon SOD1 SOD1-202 #> 4 21 31659770 31659784 + CDS SOD1 SOD1-202 #> 5 21 31659770 31659772 + start_codon SOD1 SOD1-202 #> 6 21 31663790 31663886 + exon SOD1 SOD1-202 #> # ℹ 1 more variable: transcript_biotype # extract exons sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") sod1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(sod1_exons, "transcript_name"), aes(strand = strand) ) ``` `ggtranscript` provides the helper function `shorten_gaps()`, which reduces the size of the gaps. `shorten_gaps()` then rescales the exon and intron co-ordinates to preserve the original exon alignment. This allows you to hone in the differences in the exonic structure, which can be particularly useful if the transcript has relatively long introns. ``` r sod1_rescaled <- shorten_gaps( sod1_exons, to_intron(sod1_exons, "transcript_name"), group_var = "transcript_name" ) sod1_rescaled %>% dplyr::filter(type == "exon") %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = sod1_rescaled %>% dplyr::filter(type == "intron"), arrow.min.intron.length = 200 ) ``` `geom_range()` can be used for any range-based genomic annotation. For example, when plotting protein-coding transcripts, users may find it helpful to visually distinguish the coding segments from UTRs. ``` r # filter for only exons from protein coding transcripts sod1_exons_prot_cod <- sod1_exons %>% dplyr::filter(transcript_biotype == "protein_coding") # obtain cds sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS") sod1_exons_prot_cod %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_cds ) + geom_intron( data = to_intron(sod1_exons_prot_cod, "transcript_name"), aes(strand = strand), arrow.min.intron.length = 500, ) ``` `geom_half_range()` takes advantage of the vertical symmetry of transcript annotation by plotting only half of a range on the top or bottom of a transcript structure. One use case of `geom_half_range()` is to visualize the differences between transcript structure more clearly. ``` r # extract exons and cds for the two transcripts to be compared sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201") sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202") sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202") sod1_201_202_plot <- sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = "SOD1-201/202" )) + geom_half_range( fill = "white", height = 0.125 ) + geom_half_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_half_range( data = sod1_202_exons, range.orientation = "top", fill = "white", height = 0.125 ) + geom_half_range( data = sod1_202_cds, range.orientation = "top", fill = "purple" ) + geom_intron( data = to_intron(sod1_202_exons, "transcript_name") ) sod1_201_202_plot ``` As a `ggplot2` extension, `ggtranscript` inherits the the familiarity and functionality of `ggplot2`. For instance, by leveraging `coord_cartesian()` users can zoom in on regions of interest. ``` r sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000)) ``` `geom_junction()` enables to plotting of junction curves, which can be overlaid across transcript structures. `geom_junction_label_repel()` adds a label to junction curves, which can often be useful to mark junctions with a metric of their usage such as read counts. ``` r # ggtranscript includes a set of example (unannotated) junctions # originating from GTEx and downloaded via the Bioconductor package snapcount sod1_junctions #> # A tibble: 5 × 5 #> seqnames start end strand mean_count #> #> 1 chr21 31659787 31666448 + 0.463 #> 2 chr21 31659842 31660554 + 0.831 #> 3 chr21 31659842 31663794 + 0.316 #> 4 chr21 31659842 31667257 + 4.35 #> 5 chr21 31660351 31663789 + 0.324 # add transcript_name to junctions for plotting sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201") sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, junction.y.max = 0.5 ) + geom_junction_label_repel( data = sod1_junctions, aes(label = round(mean_count, 2)), junction.y.max = 0.5 ) ``` Alternatively, users may prefer to map junction read counts to the thickness of the junction curves. As a `ggplot2` extension, this can be done intuitively by modifying the size `aes()` of `geom_junction()`. In addition, by modifying `ggplot2` scales and themes, users can easily create informative, publication-ready plots. ``` r sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, aes(size = mean_count), junction.y.max = 0.5, ncp = 30, colour = "purple" ) + scale_size_continuous(range = c(0.1, 1), guide = "none") + xlab("Genomic position (chr21)") + ylab("Transcript name") + theme_bw() #> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. #> ℹ Please use `linewidth` instead. #> This warning is displayed once every 8 hours. #> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was #> generated. ``` ## Citation ``` r citation("ggtranscript") #> To cite package 'ggtranscript' in publications use: #> #> Gustavsson EK, Zhang D, Reynolds RH, Garcia-Ruiz S, Ryten M (2022). #> "ggtranscript: an R package for the visualization and interpretation #> of transcript isoforms using ggplot2." _Bioinformatics_. #> doi:10.1093/bioinformatics/btac409 #> , #> . #> #> A BibTeX entry for LaTeX users is #> #> @Article{, #> title = {ggtranscript: an R package for the visualization and interpretation of transcript isoforms using ggplot2}, #> author = {Emil K Gustavsson and David Zhang and Regina H Reynolds and Sonia Garcia-Ruiz and Mina Ryten}, #> year = {2022}, #> journal = {Bioinformatics}, #> doi = {https://doi.org/10.1093/bioinformatics/btac409}, #> url = {https://academic.oup.com/bioinformatics/article/38/15/3844/6617821}, #> } ``` ## Credits - `ggtranscript` was developed using `biocthis`. ================================================ FILE: _pkgdown.yml ================================================ template: bootstrap: 5 bootswatch: cosmo ================================================ FILE: codecov.yml ================================================ comment: false coverage: status: project: default: target: auto threshold: 1% informational: true patch: default: target: auto threshold: 1% informational: true ================================================ FILE: data-raw/ggplot2_exts_thumbnail.R ================================================ # Load libraries ---------------------------------------------------------- library(tidyverse) devtools::load_all(".") # Main -------------------------------------------------------------------- sod1_201_exons <- sod1_annotation %>% dplyr::filter( type == "exon", transcript_name == "SOD1-201" ) sod1_201_cds <- sod1_annotation %>% dplyr::filter( type == "CDS", transcript_name == "SOD1-201" ) sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201") ggplot2_exts_figure <- sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.125 ) + geom_range( data = sod1_201_cds, height = 0.25 ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, aes(size = mean_count), junction.y.max = 0.25, ncp = 30, colour = "purple" ) + scale_size_continuous(range = c(0.1, 1), guide = "none") + xlab("Genomic position (chr21)") + ylab("Transcript name") + theme_bw() + theme( axis.line = element_line(colour = "black"), panel.grid = element_blank(), panel.border = element_blank() ) ggplot2_exts_figure # Save data --------------------------------------------------------------- ggsave( plot = ggplot2_exts_figure, filename = here::here("man", "figures", "dzhang32-ggtranscript.png"), height = 3, width = 3.5, dpi = 600 ) ================================================ FILE: data-raw/ggtranscript_logo.R ================================================ # Load libraries ---------------------------------------------------------- library(tidyverse) library(hexSticker) library(showtext) devtools::load_all(".") # Main -------------------------------------------------------------------- logo_exons <- tribble( ~start, ~end, ~tx, ~letter, #-------|-----|----|-------- 150, 200, "J", "T", 500, 550, "J", "T", 300, 310, "I", "T", 350, 400, "I", "T", 300, 350, "H", "T", 390, 400, "H", "T", 300, 310, "G", "T", 350, 400, "G", "T", 700, 800, "J", "X_top", 1100, 1200, "J", "X_top", 700, 800, "G", "X_bot", 1100, 1200, "G", "X_bot" ) %>% dplyr::mutate( tx = tx %>% factor( levels = LETTERS[1:17] ) ) logo_utr <- tribble( ~start, ~end, ~tx, ~letter, #-------|-----|----|-------- 100, 150, "J", "T", 550, 600, "J", "T", ) logo_introns <- logo_exons %>% dplyr::filter(letter == "T") %>% to_intron(group_var = "tx") logo_junctions <- logo_exons %>% dplyr::filter(letter %in% c("X_top", "X_bot")) %>% to_intron(group_var = "tx") size <- 0.3 colour <- "black" fill <- ggpubr::get_palette("jco", 10)[10] # create T ggtranscript_logo <- logo_exons %>% dplyr::filter(letter == "T") %>% ggplot(aes( xstart = start, xend = end, y = tx )) + geom_range( fill = fill, size = size, colour = colour ) + geom_range( data = logo_utr, fill = "white", height = 0.25, size = size, colour = colour ) + geom_intron( data = logo_introns, size = size, colour = colour, arrow.min.intron.length = 100 ) ggtranscript_logo <- ggtranscript_logo + geom_half_range( data = logo_exons %>% dplyr::filter(letter == "X_top"), range.orientation = "top", fill = fill, size = size, colour = colour, ) + geom_half_range( data = logo_exons %>% dplyr::filter(letter == "X_bot"), fill = fill, size = size, colour = colour, ) + geom_junction( data = logo_junctions %>% dplyr::filter(letter == "X_bot"), size = size, colour = colour, junction.orientation = "top", junction.y.max = 1.4, ncp = 50 ) + geom_junction( data = logo_junctions %>% dplyr::filter(letter == "X_top"), size = size, colour = colour, junction.orientation = "bottom", junction.y.max = 1.4, ncp = 50 ) ggtranscript_logo <- ggtranscript_logo + scale_x_continuous( limits = c(-300, 1600), minor_breaks = seq(-300, 1500, 100) ) + scale_y_discrete(drop = FALSE) + theme_bw() + theme( axis.text = element_blank(), axis.ticks = element_blank(), axis.title = element_blank(), panel.grid.major = element_line( size = size, colour = ggpubr::get_palette("Greys", 10)[2] ), panel.grid.minor = element_line( size = size, colour = ggpubr::get_palette("Greys", 10)[2] ) ) ggtranscript_logo # Save data --------------------------------------------------------------- # use font from https://fonts.google.com font_add_google(name = "Raleway", family = "Raleway") showtext_auto() ggtranscript_logo_hex <- hexSticker::sticker( # the plot (TX) subplot = ggtranscript_logo, s_x = 0.98, s_y = 1.2, s_width = 2.8, s_height = 3, # the package package = "ggtranscript", p_x = 1, p_y = 0.65, p_size = 35, p_family = "Raleway", p_fontface = "bold", p_color = ggpubr::get_palette("jco", 10)[6], # hex border h_color = ggpubr::get_palette("jco", 10)[6], h_fill = "white", h_size = 2, # url url = "https://github.com/dzhang32/ggtranscript", u_family = "Raleway", u_color = ggpubr::get_palette("jco", 10)[6], u_size = 6.5, # general filename = here::here("man", "figures", "ggtranscript_logo.png"), dpi = 600, white_around_sticker = TRUE ) # here::here("man", "figures", "ggtranscript_logo.png") is then # manually cropped to remove the white background in Inkscape plot(ggtranscript_logo_hex) ================================================ FILE: data-raw/sod1_junctions.R ================================================ # Load libraries ---------------------------------------------------------- library(tidyverse) library(snapcount) library(SummarizedExperiment) # Main -------------------------------------------------------------------- # obtain gtex junctions across SOD1 sod1_query <- snapcount::QueryBuilder(compilation = "gtex", regions = "SOD1") # keeping only unannotated junctions # from liver where SOD1 is highly expressed # https://gtexportal.org/home/gene/SOD1 sod1_query <- set_row_filters(sod1_query, annotated == 0) sod1_query <- set_column_filters(sod1_query, SMTS == "Liver") sod1_junctions <- snapcount::query_jx(sod1_query) # obtain mean counts mean_counts <- sod1_junctions %>% SummarizedExperiment::assays() %>% .[["counts"]] %>% as.matrix() %>% rowMeans() sod1_junctions <- sod1_junctions %>% SummarizedExperiment::rowRanges() %>% as.data.frame() %>% dplyr::as_tibble() # minor QC and tidying of the junctions sod1_junctions <- sod1_junctions %>% dplyr::mutate(mean_count = mean_counts) %>% dplyr::filter(mean_count > 0.3) %>% dplyr::select( seqnames, start, end, strand, mean_count ) # Save data --------------------------------------------------------------- usethis::use_data( sod1_junctions, compress = "gzip", overwrite = TRUE ) ================================================ FILE: data-raw/sod1_pknox1_annotation.R ================================================ # Load libraries ---------------------------------------------------------- library(tidyverse) library(rtracklayer) library(R.utils) # Main -------------------------------------------------------------------- gtf_path <- file.path(tempdir(), "Homo_sapiens.GRCh38.105.chr.gtf.gz") # download ens 105 gtf download.file( stringr::str_c( "http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/", "Homo_sapiens.GRCh38.105.chr.gtf.gz" ), destfile = gtf_path ) # unzip gtf R.utils::gunzip(gtf_path) gtf_path <- gtf_path %>% stringr::str_remove("\\.gz$") gtf <- rtracklayer::import(gtf_path) # extract example gene transcripts # convert to tibble() sod1_annotation <- gtf[!is.na(gtf$gene_name) & gtf$gene_name == "SOD1"] %>% as.data.frame() %>% dplyr::as_tibble() %>% dplyr::select( seqnames, start, end, strand, type, gene_name, transcript_name, transcript_biotype ) pknox1_annotation <- gtf[!is.na(gtf$gene_name) & gtf$gene_name == "PKNOX1"] %>% as.data.frame() %>% dplyr::as_tibble() %>% dplyr::select( seqnames, start, end, strand, type, gene_name, transcript_name, transcript_biotype ) # Save data --------------------------------------------------------------- usethis::use_data( sod1_annotation, compress = "gzip", overwrite = TRUE ) usethis::use_data( pknox1_annotation, compress = "gzip", overwrite = TRUE ) ================================================ FILE: inst/CITATION ================================================ pkgVer <- function(pkg) { if (!exists("meta") || is.null(meta)) meta <- packageDescription(pkg) ver <- meta$Version paste0('https://github.com/dzhang32/', pkg, ' - R package version ', ver) } c( bibentry(bibtype="article", title = "ggtranscript: an R package for the visualization and interpretation of transcript isoforms using ggplot2", author = personList( as.person("Emil K Gustavsson"), as.person("David Zhang"), as.person("Regina H Reynolds"), as.person("Sonia Garcia-Ruiz"), as.person("Mina Ryten") ), year = 2022, journal = "Bioinformatics", doi = "https://doi.org/10.1093/bioinformatics/btac409", url = "https://academic.oup.com/bioinformatics/article/38/15/3844/6617821" ) ) ================================================ FILE: man/add_exon_number.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/add_exon_number.R \name{add_exon_number} \alias{add_exon_number} \title{Add exon number} \usage{ add_exon_number(exons, group_var = NULL) } \arguments{ \item{exons}{\code{data.frame()} contains exons which can originate from multiple transcripts differentiated by \code{group_var}.} \item{group_var}{\code{character()} if input data originates from more than 1 transcript, \code{group_var} must specify the column that differentiates transcripts (e.g. "transcript_id").} } \value{ \code{data.frame()} equivalent to input \code{exons}, with the additional column "exon_number". } \description{ \code{add_exon_number()} adds the exon number (the order the exons are transcribed within each transcript) as a column in \code{exons}. This can be useful when visualizing long, complex transcript structures, in order to keep track of specific exons of interest. } \details{ To note, a "strand" column must be present within \code{exons}. The strand is used to differentiate whether exon numbers should be calculated according to ascending ("+") or descending ("-") genomic co-ordinates. For ambiguous strands ("*"), \code{add_exon_number()} will be assume the strand be "+". } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation \%>\% head() # extract exons sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon") sod1_exons \%>\% head() # add the exon number for each transcript sod1_exons <- sod1_exons \%>\% add_exon_number(group_var = "transcript_name") base <- sod1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = to_intron(sod1_exons, "transcript_name"), strand = "+" ) # it can be useful to annotate exons with their exon number # using ggplot2::geom_text() base + geom_text(aes( x = (start + end) / 2, # plot label at midpoint of exon label = exon_number ), size = 3.5, nudge_y = 0.4 ) # Or alternatively, using ggrepel::geom_label_repel() # to separate labels from exons base + ggrepel::geom_label_repel(ggplot2::aes( x = (start + end) / 2, label = exon_number ), size = 3.5, min.segment.length = 0 ) } ================================================ FILE: man/add_utr.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/add_utr.R \name{add_utr} \alias{add_utr} \title{Add untranslated regions (UTRs)} \usage{ add_utr(exons, cds, group_var = NULL) } \arguments{ \item{exons}{\code{data.frame()} contains exons which can originate from multiple transcripts differentiated by \code{group_var}.} \item{cds}{\code{data.frame()} contains coding sequence ranges for the transcripts in \code{exons}.} \item{group_var}{\code{character()} if input data originates from more than 1 transcript, \code{group_var} must specify the column that differentiates transcripts (e.g. "transcript_id").} } \value{ \code{data.frame()} contains differentiated CDS and UTR ranges. } \description{ Given a set of \code{exons} (encompassing the CDS and UTRs) and \code{cds} regions, \code{add_utr()} will calculate and add the corresponding UTR regions as ranges. This can be useful when combined with \code{shorten_gaps()} to visualize transcripts with long introns, whilst differentiating UTRs from CDS regions. } \details{ The definition of the inputted \code{cds} regions are expected to range from the beginning of the start codon to the end of the stop codon. Sometimes, for example in the case of Ensembl, reference annotation will omit the stop codons from the CDS definition. In such cases, users should manually ensure that the \code{cds} includes both the start and stop codons. } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation pknox1_annotation \%>\% head() # extract exons pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon") pknox1_exons \%>\% head() # extract cds pknox1_cds <- pknox1_annotation \%>\% dplyr::filter(type == "CDS") pknox1_cds \%>\% head() # the CDS definition originating from the Ensembl reference annotation # does not include the stop codon # we must incorporate the stop codons into the CDS manually # by adding 3 base pairs to the end of the CDS of each transcript pknox1_cds_w_stop <- pknox1_cds \%>\% dplyr::group_by(transcript_name) \%>\% dplyr::mutate( end = ifelse(end == max(end), end + 3, end) ) \%>\% dplyr::ungroup() # add_utr() adds ranges that represent the UTRs pknox1_cds_utr <- add_utr( pknox1_exons, pknox1_cds_w_stop, group_var = "transcript_name" ) pknox1_cds_utr \%>\% head() # this can be useful when combined with shorten_gaps() # to visualize transcripts with long introns whilst differentiating UTRs pknox1_cds_utr_rescaled <- shorten_gaps( exons = pknox1_cds_utr, introns = to_intron(pknox1_cds_utr, "transcript_name"), group_var = "transcript_name" ) pknox1_cds_utr_rescaled \%>\% dplyr::filter(type == "CDS") \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_range( data = pknox1_cds_utr_rescaled \%>\% dplyr::filter(type == "UTR"), height = 0.25, fill = "white" ) + geom_intron( data = to_intron( pknox1_cds_utr_rescaled \%>\% dplyr::filter(type != "intron"), "transcript_name" ), arrow.min.intron.length = 110 ) } ================================================ FILE: man/geom_intron.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/geom_intron.R \name{geom_intron} \alias{geom_intron} \title{Plot intron lines with strand arrows} \usage{ geom_intron( mapping = NULL, data = NULL, stat = "identity", position = "identity", ..., arrow = grid::arrow(ends = "last", length = grid::unit(0.1, "inches")), arrow.fill = NULL, lineend = "butt", linejoin = "round", na.rm = FALSE, arrow.min.intron.length = 0, show.legend = NA, inherit.aes = TRUE ) } \arguments{ \item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and \code{inherit.aes = TRUE} (the default), it is combined with the default mapping at the top level of the plot. You must supply \code{mapping} if there is no plot mapping.} \item{data}{The data to be displayed in this layer. There are three options: If \code{NULL}, the default, the data is inherited from the plot data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}. A \code{data.frame}, or other object, will override the plot data. All objects will be fortified to produce a data frame. See \code{\link[ggplot2:fortify]{fortify()}} for which variables will be created. A \code{function} will be called with a single argument, the plot data. The return value must be a \code{data.frame}, and will be used as the layer data. A \code{function} can be created from a \code{formula} (e.g. \code{~ head(.x, 10)}).} \item{stat}{The statistical transformation to use on the data for this layer. When using a \verb{geom_*()} function to construct a layer, the \code{stat} argument can be used the override the default coupling between geoms and stats. The \code{stat} argument accepts the following: \itemize{ \item A \code{Stat} ggproto subclass, for example \code{StatCount}. \item A string naming the stat. To give the stat as a string, strip the function name of the \code{stat_} prefix. For example, to use \code{stat_count()}, give the stat as \code{"count"}. \item For more information and other ways to specify the stat, see the \link[ggplot2:layer_stats]{layer stat} documentation. }} \item{position}{A position adjustment to use on the data for this layer. This can be used in various ways, including to prevent overplotting and improving the display. The \code{position} argument accepts the following: \itemize{ \item The result of calling a position function, such as \code{position_jitter()}. This method allows for passing extra arguments to the position. \item A string naming the position adjustment. To give the position as a string, strip the function name of the \code{position_} prefix. For example, to use \code{position_jitter()}, give the position as \code{"jitter"}. \item For more information and other ways to specify the position, see the \link[ggplot2:layer_positions]{layer position} documentation. }} \item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These arguments broadly fall into one of 4 categories below. Notably, further arguments to the \code{position} argument, or aesthetics that are required can \emph{not} be passed through \code{...}. Unknown arguments that are not part of the 4 categories below are ignored. \itemize{ \item Static aesthetics that are not mapped to a scale, but are at a fixed value and apply to the layer as a whole. For example, \code{colour = "red"} or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics} section that lists the available options. The 'required' aesthetics cannot be passed on to the \code{params}. Please note that while passing unmapped aesthetics as vectors is technically possible, the order and required length is not guaranteed to be parallel to the input data. \item When constructing a layer using a \verb{stat_*()} function, the \code{...} argument can be used to pass on parameters to the \code{geom} part of the layer. An example of this is \code{stat_density(geom = "area", outline.type = "both")}. The geom's documentation lists which parameters it can accept. \item Inversely, when constructing a layer using a \verb{geom_*()} function, the \code{...} argument can be used to pass on parameters to the \code{stat} part of the layer. An example of this is \code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation lists which parameters it can accept. \item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through \code{...}. This can be one of the functions described as \link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend. }} \item{arrow}{specification for arrow heads, as created by \code{\link[grid:arrow]{grid::arrow()}}.} \item{arrow.fill}{fill colour to use for the arrow head (if closed). \code{NULL} means use \code{colour} aesthetic.} \item{lineend}{Line end style (round, butt, square).} \item{linejoin}{Line join style (round, mitre, bevel).} \item{na.rm}{If \code{FALSE}, the default, missing values are removed with a warning. If \code{TRUE}, missing values are silently removed.} \item{arrow.min.intron.length}{\code{integer()} the minimum required width of an intron for a strand arrow to be drawn. This can be useful to remove strand arrows on short introns that overlap adjacent exons.} \item{show.legend}{logical. Should this layer be included in the legends? \code{NA}, the default, includes if any aesthetics are mapped. \code{FALSE} never includes, and \code{TRUE} always includes. It can also be a named logical vector to finely select the aesthetics to display.} \item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather than combining with them. This is most useful for helper functions that define both data and aesthetics and shouldn't inherit behaviour from the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.} } \value{ the return value of a \verb{geom_*} function is not intended to be directly handled by users. Therefore, \verb{geom_*} functions should never be executed in isolation, rather used in combination with a \code{ggplot2::ggplot()} call. } \description{ \code{geom_intron()} draws horizontal lines with central arrows that are designed to represent introns. In combination with \code{geom_range()}/\code{geom_half_range()}, these geoms form the core components for visualizing transcript structures. } \details{ \code{geom_intron()} requires the following \code{aes()}; \code{xstart}, \code{xend} and \code{y} (e.g. transcript name). If users do not have intron co-ordinates, these can be generated from the corresponding exons using \code{to_intron()}. The \code{strand} option (one of "+" or "-") adjusts the arrow direction to match the direction of transcription. The \code{arrow.min.intron.length} parameter can be useful to remove strand arrows that overlap exons, which can be a problem if plotted introns include those that are relatively short. } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation pknox1_annotation \%>\% head() # extract exons pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon") pknox1_exons \%>\% head() # to_intron() is a helper function included in ggtranscript # which is useful for converting exon co-ordinates to introns pknox1_introns <- pknox1_exons \%>\% to_intron(group_var = "transcript_name") pknox1_introns \%>\% head() base <- pknox1_introns \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) # by default, geom_intron() assumes introns originate from the "+" strand base + geom_intron() # however this can be modified using the strand option base + geom_intron(strand = "-") # strand can also be set as an aes() base + geom_intron(aes(strand = strand)) # as a ggplot2 extension, ggtranscript geoms inherit the # the functionality from the parameters and aesthetics in ggplot2 base + geom_intron( aes(colour = transcript_name), linewidth = 1 ) # together, geom_range() and geom_intron() are designed to visualize # the core components of transcript annotation pknox1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = pknox1_introns ) # for short introns, sometimes strand arrows will overlap exons # to avoid this, users can set the arrow.min.intron.length parameter pknox1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = pknox1_introns, arrow.min.intron.length = 3500 ) } ================================================ FILE: man/geom_junction.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/geom_junction.R \name{geom_junction} \alias{geom_junction} \title{Plot junction curves} \usage{ geom_junction( mapping = NULL, data = NULL, stat = "identity", position = "identity", junction.orientation = "alternating", junction.y.max = 1, angle = 90, ncp = 15, na.rm = FALSE, orientation = NA, show.legend = NA, inherit.aes = TRUE, ... ) } \arguments{ \item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and \code{inherit.aes = TRUE} (the default), it is combined with the default mapping at the top level of the plot. You must supply \code{mapping} if there is no plot mapping.} \item{data}{The data to be displayed in this layer. There are three options: If \code{NULL}, the default, the data is inherited from the plot data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}. A \code{data.frame}, or other object, will override the plot data. All objects will be fortified to produce a data frame. See \code{\link[ggplot2:fortify]{fortify()}} for which variables will be created. A \code{function} will be called with a single argument, the plot data. The return value must be a \code{data.frame}, and will be used as the layer data. A \code{function} can be created from a \code{formula} (e.g. \code{~ head(.x, 10)}).} \item{stat}{The statistical transformation to use on the data for this layer. When using a \verb{geom_*()} function to construct a layer, the \code{stat} argument can be used the override the default coupling between geoms and stats. The \code{stat} argument accepts the following: \itemize{ \item A \code{Stat} ggproto subclass, for example \code{StatCount}. \item A string naming the stat. To give the stat as a string, strip the function name of the \code{stat_} prefix. For example, to use \code{stat_count()}, give the stat as \code{"count"}. \item For more information and other ways to specify the stat, see the \link[ggplot2:layer_stats]{layer stat} documentation. }} \item{position}{A position adjustment to use on the data for this layer. This can be used in various ways, including to prevent overplotting and improving the display. The \code{position} argument accepts the following: \itemize{ \item The result of calling a position function, such as \code{position_jitter()}. This method allows for passing extra arguments to the position. \item A string naming the position adjustment. To give the position as a string, strip the function name of the \code{position_} prefix. For example, to use \code{position_jitter()}, give the position as \code{"jitter"}. \item For more information and other ways to specify the position, see the \link[ggplot2:layer_positions]{layer position} documentation. }} \item{junction.orientation}{\code{character()} one of "alternating", "top" or "bottom", specifying where the junctions will be plotted with respect to each transcript (\code{y}).} \item{junction.y.max}{\code{double()} the max y-value of each junction curve. It can be useful to adjust this parameter when junction curves overlap with one another/other transcripts or extend beyond the plot margins.} \item{angle}{A numeric value between 0 and 180, giving an amount to skew the control points of the curve. Values less than 90 skew the curve towards the start point and values greater than 90 skew the curve towards the end point.} \item{ncp}{The number of control points used to draw the curve. More control points creates a smoother curve.} \item{na.rm}{If \code{FALSE}, the default, missing values are removed with a warning. If \code{TRUE}, missing values are silently removed.} \item{orientation}{The orientation of the layer. The default (\code{NA}) automatically determines the orientation from the aesthetic mapping. In the rare event that this fails it can be given explicitly by setting \code{orientation} to either \code{"x"} or \code{"y"}. See the \emph{Orientation} section for more detail.} \item{show.legend}{logical. Should this layer be included in the legends? \code{NA}, the default, includes if any aesthetics are mapped. \code{FALSE} never includes, and \code{TRUE} always includes. It can also be a named logical vector to finely select the aesthetics to display.} \item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather than combining with them. This is most useful for helper functions that define both data and aesthetics and shouldn't inherit behaviour from the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.} \item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These arguments broadly fall into one of 4 categories below. Notably, further arguments to the \code{position} argument, or aesthetics that are required can \emph{not} be passed through \code{...}. Unknown arguments that are not part of the 4 categories below are ignored. \itemize{ \item Static aesthetics that are not mapped to a scale, but are at a fixed value and apply to the layer as a whole. For example, \code{colour = "red"} or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics} section that lists the available options. The 'required' aesthetics cannot be passed on to the \code{params}. Please note that while passing unmapped aesthetics as vectors is technically possible, the order and required length is not guaranteed to be parallel to the input data. \item When constructing a layer using a \verb{stat_*()} function, the \code{...} argument can be used to pass on parameters to the \code{geom} part of the layer. An example of this is \code{stat_density(geom = "area", outline.type = "both")}. The geom's documentation lists which parameters it can accept. \item Inversely, when constructing a layer using a \verb{geom_*()} function, the \code{...} argument can be used to pass on parameters to the \code{stat} part of the layer. An example of this is \code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation lists which parameters it can accept. \item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through \code{...}. This can be one of the functions described as \link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend. }} } \value{ the return value of a \verb{geom_*} function is not intended to be directly handled by users. Therefore, \verb{geom_*} functions should never be executed in isolation, rather used in combination with a \code{ggplot2::ggplot()} call. } \description{ \code{geom_junction()} draws curves that are designed to represent junction reads from RNA-sequencing data. It can be useful to overlay junction data on transcript annotation (plotted using \code{geom_range()}/\code{geom_half_range()} and \code{geom_intron()}) to understand which splicing events or transcripts have support from RNA-sequencing data. } \details{ \code{geom_junction()} requires the following \code{aes()}; \code{xstart}, \code{xend} and \code{y} (e.g. transcript name). \code{geom_junction()} curves can be modified using \code{junction.y.max}, which can be useful when junctions overlap one another/other transcripts or extend beyond the plot margins. By default, junction curves will alternate between being plotted on the top and bottom of each transcript (\code{y}), however this can be modified via \code{junction.orientation}. } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation \%>\% head() # as well as a set of example (unannotated) junctions # originating from GTEx and downloaded via the Bioconductor package snapcount sod1_junctions # extract exons sod1_exons <- sod1_annotation \%>\% dplyr::filter( type == "exon", transcript_name == "SOD1-201" ) sod1_exons \%>\% head() # add transcript_name to junctions for plotting sod1_junctions <- sod1_junctions \%>\% dplyr::mutate(transcript_name = "SOD1-201") # junctions can be plotted as curves using geom_junction() base <- sod1_junctions \%>\% ggplot2::ggplot(ggplot2::aes( xstart = start, xend = end, y = transcript_name )) # sometimes, depending on the number and widths of transcripts and junctions # junctions will go overlap one another or extend beyond the plot margin base + geom_junction() # in such cases, junction.y.max can be adjusted to modify the max y of curves base + geom_junction(junction.y.max = 0.5) # ncp can be used improve the smoothness of curves base + geom_junction(junction.y.max = 0.5, ncp = 30) # junction.orientation controls where the junction are plotted # with respect to each transcript # either alternating (default), or on the top or bottom base + geom_junction(junction.orientation = "top", junction.y.max = 0.5) base + geom_junction(junction.orientation = "bottom", junction.y.max = 0.5) # it can be useful useful to overlay junction curves onto existing annotation # plotted using geom_range() and geom_intron() base <- sod1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = to_intron(sod1_exons, "transcript_name") ) base + geom_junction( data = sod1_junctions, junction.y.max = 0.5 ) # as a ggplot2 extension, ggtranscript geoms inherit the # the functionality from the parameters and aesthetics in ggplot2 # this can be useful when mapping junction thickness to their counts base + geom_junction( data = sod1_junctions, aes(linewidth = mean_count), junction.y.max = 0.5, colour = "purple" ) + scale_linewidth(range = c(0.1, 1)) # it can be useful to combine geom_junction() with geom_half_range() sod1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_half_range() + geom_intron( data = to_intron(sod1_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, aes(linewidth = mean_count), junction.y.max = 0.5, junction.orientation = "top", colour = "purple" ) + scale_linewidth(range = c(0.1, 1)) } ================================================ FILE: man/geom_junction_label_repel.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/geom_junction_label_repel.R \name{geom_junction_label_repel} \alias{geom_junction_label_repel} \title{Label junction curves} \usage{ geom_junction_label_repel( mapping = NULL, data = NULL, stat = "identity", position = "identity", parse = FALSE, ..., junction.orientation = "alternating", junction.y.max = 1, angle = 90, ncp = 15, box.padding = 0.25, label.padding = 0.25, point.padding = 1e-06, label.r = 0.15, label.size = 0.25, min.segment.length = 0, arrow = NULL, force = 1, force_pull = 1, max.time = 0.5, max.iter = 10000, max.overlaps = getOption("ggrepel.max.overlaps", default = 10), nudge_x = 0, nudge_y = 0, xlim = c(NA, NA), ylim = c(NA, NA), na.rm = FALSE, show.legend = NA, direction = c("both", "y", "x"), seed = NA, verbose = FALSE, inherit.aes = TRUE ) } \arguments{ \item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2]{aes}} or \code{\link[ggplot2]{aes_}}. If specified and \code{inherit.aes = TRUE} (the default), is combined with the default mapping at the top level of the plot. You only need to supply \code{mapping} if there isn't a mapping defined for the plot.} \item{data}{A data frame. If specified, overrides the default data frame defined at the top level of the plot.} \item{stat}{The statistical transformation to use on the data for this layer, as a string.} \item{position}{Position adjustment, either as a string, or the result of a call to a position adjustment function.} \item{parse}{If TRUE, the labels will be parsed into expressions and displayed as described in ?plotmath} \item{...}{other arguments passed on to \code{\link[ggplot2]{layer}}. There are three types of arguments you can use here: \itemize{ \item Aesthetics: to set an aesthetic to a fixed value, like \code{colour = "red"} or \code{size = 3}. \item Other arguments to the layer, for example you override the default \code{stat} associated with the layer. \item Other arguments passed on to the stat. }} \item{junction.orientation}{\code{character()} one of "alternating", "top" or "bottom", specifying where the junctions will be plotted with respect to each transcript (\code{y}).} \item{junction.y.max}{\code{double()} the max y-value of each junction curve. It can be useful to adjust this parameter when junction curves overlap with one another/other transcripts or extend beyond the plot margins.} \item{angle}{A numeric value between 0 and 180, giving an amount to skew the control points of the curve. Values less than 90 skew the curve towards the start point and values greater than 90 skew the curve towards the end point.} \item{ncp}{The number of control points used to draw the curve. More control points creates a smoother curve.} \item{box.padding}{Amount of padding around bounding box, as unit or number. Defaults to 0.25. (Default unit is lines, but other units can be specified by passing \code{unit(x, "units")}).} \item{label.padding}{Amount of padding around label, as unit or number. Defaults to 0.25. (Default unit is lines, but other units can be specified by passing \code{unit(x, "units")}).} \item{point.padding}{Amount of padding around labeled point, as unit or number. Defaults to 0. (Default unit is lines, but other units can be specified by passing \code{unit(x, "units")}).} \item{label.r}{Radius of rounded corners, as unit or number. Defaults to 0.15. (Default unit is lines, but other units can be specified by passing \code{unit(x, "units")}).} \item{label.size}{Size of label border, in mm.} \item{min.segment.length}{Skip drawing segments shorter than this, as unit or number. Defaults to 0.5. (Default unit is lines, but other units can be specified by passing \code{unit(x, "units")}).} \item{arrow}{specification for arrow heads, as created by \code{\link[grid]{arrow}}} \item{force}{Force of repulsion between overlapping text labels. Defaults to 1.} \item{force_pull}{Force of attraction between a text label and its corresponding data point. Defaults to 1.} \item{max.time}{Maximum number of seconds to try to resolve overlaps. Defaults to 0.5.} \item{max.iter}{Maximum number of iterations to try to resolve overlaps. Defaults to 10000.} \item{max.overlaps}{Exclude text labels when they overlap too many other things. For each text label, we count how many other text labels or other data points it overlaps, and exclude the text label if it has too many overlaps. Defaults to 10.} \item{nudge_x, nudge_y}{Horizontal and vertical adjustments to nudge the starting position of each text label. The units for \code{nudge_x} and \code{nudge_y} are the same as for the data units on the x-axis and y-axis.} \item{xlim, ylim}{Limits for the x and y axes. Text labels will be constrained to these limits. By default, text labels are constrained to the entire plot area.} \item{na.rm}{If \code{FALSE} (the default), removes missing values with a warning. If \code{TRUE} silently removes missing values.} \item{show.legend}{logical. Should this layer be included in the legends? \code{NA}, the default, includes if any aesthetics are mapped. \code{FALSE} never includes, and \code{TRUE} always includes.} \item{direction}{"both", "x", or "y" -- direction in which to adjust position of labels} \item{seed}{Random seed passed to \code{\link[base]{set.seed}}. Defaults to \code{NA}, which means that \code{set.seed} will not be called.} \item{verbose}{If \code{TRUE}, some diagnostics of the repel algorithm are printed} \item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather than combining with them. This is most useful for helper functions that define both data and aesthetics and shouldn't inherit behaviour from the default plot specification, e.g. \code{\link[ggplot2]{borders}}.} } \value{ the return value of a \verb{geom_*} function is not intended to be directly handled by users. Therefore, \verb{geom_*} functions should never be executed in isolation, rather used in combination with a \code{ggplot2::ggplot()} call. } \description{ \code{geom_junction_label_repel()} labels junction curves at their midpoint using \code{ggrepel::geom_label_repel()}. This can be useful to label and compare junctions (plotted using \code{geom_junction()}) with metrics of their usage (e.g. read counts or percent-spliced-in). } \details{ \code{geom_junction_label_repel()} requires the following \code{aes()}; \code{xstart}, \code{xend}, \code{y} (e.g. transcript name) and \code{label}. Under the hood, \code{geom_junction_label_repel()} generates the same junction curves as \code{geom_junction()} to obtain curve midpoints for labeling. Therefore, it is important that users use the same input data and parameters that alter junction curves (namely \code{junction.orientation}, \code{junction.y.max}, \code{angle}, \code{ncp}) for \code{geom_junction_label_repel()} that they have used for \code{geom_junction()}. } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation \%>\% head() # as well as a set of example (unannotated) junctions # originating from GTEx and downloaded via the Bioconductor package snapcount sod1_junctions # extract exons sod1_exons <- sod1_annotation \%>\% dplyr::filter( type == "exon", transcript_name == "SOD1-201" ) sod1_exons \%>\% head() # add transcript_name to junctions for plotting sod1_junctions <- sod1_junctions \%>\% dplyr::mutate(transcript_name = "SOD1-201") # geom_junction_label_repel() can be used to label junctions base <- sod1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = to_intron(sod1_exons, "transcript_name") ) # this can be useful to label junctions with their counts base + geom_junction( data = sod1_junctions, junction.y.max = 0.5 ) + geom_junction_label_repel( data = sod1_junctions, aes(label = round(mean_count, 2)), junction.y.max = 0.5 ) } ================================================ FILE: man/geom_range.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/geom_range.R, R/geom_half_range.R \name{geom_range} \alias{geom_range} \alias{geom_half_range} \title{Plot genomic ranges} \usage{ geom_range( mapping = NULL, data = NULL, stat = "identity", position = "identity", ..., vjust = NULL, linejoin = "mitre", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE ) geom_half_range( mapping = NULL, data = NULL, stat = "identity", position = "identity", ..., range.orientation = "bottom", linejoin = "mitre", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE ) } \arguments{ \item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and \code{inherit.aes = TRUE} (the default), it is combined with the default mapping at the top level of the plot. You must supply \code{mapping} if there is no plot mapping.} \item{data}{The data to be displayed in this layer. There are three options: If \code{NULL}, the default, the data is inherited from the plot data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}. A \code{data.frame}, or other object, will override the plot data. All objects will be fortified to produce a data frame. See \code{\link[ggplot2:fortify]{fortify()}} for which variables will be created. A \code{function} will be called with a single argument, the plot data. The return value must be a \code{data.frame}, and will be used as the layer data. A \code{function} can be created from a \code{formula} (e.g. \code{~ head(.x, 10)}).} \item{stat}{The statistical transformation to use on the data for this layer. When using a \verb{geom_*()} function to construct a layer, the \code{stat} argument can be used the override the default coupling between geoms and stats. The \code{stat} argument accepts the following: \itemize{ \item A \code{Stat} ggproto subclass, for example \code{StatCount}. \item A string naming the stat. To give the stat as a string, strip the function name of the \code{stat_} prefix. For example, to use \code{stat_count()}, give the stat as \code{"count"}. \item For more information and other ways to specify the stat, see the \link[ggplot2:layer_stats]{layer stat} documentation. }} \item{position}{A position adjustment to use on the data for this layer. This can be used in various ways, including to prevent overplotting and improving the display. The \code{position} argument accepts the following: \itemize{ \item The result of calling a position function, such as \code{position_jitter()}. This method allows for passing extra arguments to the position. \item A string naming the position adjustment. To give the position as a string, strip the function name of the \code{position_} prefix. For example, to use \code{position_jitter()}, give the position as \code{"jitter"}. \item For more information and other ways to specify the position, see the \link[ggplot2:layer_positions]{layer position} documentation. }} \item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These arguments broadly fall into one of 4 categories below. Notably, further arguments to the \code{position} argument, or aesthetics that are required can \emph{not} be passed through \code{...}. Unknown arguments that are not part of the 4 categories below are ignored. \itemize{ \item Static aesthetics that are not mapped to a scale, but are at a fixed value and apply to the layer as a whole. For example, \code{colour = "red"} or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics} section that lists the available options. The 'required' aesthetics cannot be passed on to the \code{params}. Please note that while passing unmapped aesthetics as vectors is technically possible, the order and required length is not guaranteed to be parallel to the input data. \item When constructing a layer using a \verb{stat_*()} function, the \code{...} argument can be used to pass on parameters to the \code{geom} part of the layer. An example of this is \code{stat_density(geom = "area", outline.type = "both")}. The geom's documentation lists which parameters it can accept. \item Inversely, when constructing a layer using a \verb{geom_*()} function, the \code{...} argument can be used to pass on parameters to the \code{stat} part of the layer. An example of this is \code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation lists which parameters it can accept. \item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through \code{...}. This can be one of the functions described as \link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend. }} \item{vjust}{A numeric vector specifying vertical justification. If specified, overrides the \code{just} setting.} \item{linejoin}{Line join style (round, mitre, bevel).} \item{na.rm}{If \code{FALSE}, the default, missing values are removed with a warning. If \code{TRUE}, missing values are silently removed.} \item{show.legend}{logical. Should this layer be included in the legends? \code{NA}, the default, includes if any aesthetics are mapped. \code{FALSE} never includes, and \code{TRUE} always includes. It can also be a named logical vector to finely select the aesthetics to display.} \item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather than combining with them. This is most useful for helper functions that define both data and aesthetics and shouldn't inherit behaviour from the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.} \item{range.orientation}{\code{character()} one of "top" or "bottom", specifying where the half ranges will be plotted with respect to each transcript (\code{y}).} } \value{ the return value of a \verb{geom_*} function is not intended to be directly handled by users. Therefore, \verb{geom_*} functions should never be executed in isolation, rather used in combination with a \code{ggplot2::ggplot()} call. } \description{ \code{geom_range()} and \code{geom_half_range()} draw tiles that are designed to represent range-based genomic features, such as exons. In combination with \code{geom_intron()}, these geoms form the core components for visualizing transcript structures. } \details{ \code{geom_range()} and \code{geom_half_range()} require the following \code{aes()}; \code{xstart}, \code{xend} and \code{y} (e.g. transcript name). \code{geom_half_range()} takes advantage of the vertical symmetry of transcript annotation by plotting only half of a range on the top or bottom of a transcript structure. This can be useful for comparing between two transcripts or free up plotting space for other transcript annotations (e.g. \code{geom_junction()}). } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation \%>\% head() # extract exons sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon") sod1_exons \%>\% head() base <- sod1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) # geom_range() is designed to visualise range-based annotation such as exons base + geom_range() # geom_half_range() allows users to plot half ranges # on the top or bottom of the transcript base + geom_half_range() # where the half ranges are plotted can be adjusted using range.orientation base + geom_half_range(range.orientation = "top") # as a ggplot2 extension, ggtranscript geoms inherit the # the functionality from the parameters and aesthetics in ggplot2 base + geom_range( aes(fill = transcript_name), linewidth = 1 ) # together, geom_range() and geom_intron() are designed to visualize # the core components of transcript annotation base + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(sod1_exons, "transcript_name") ) # for protein coding transcripts # geom_range() be useful for visualizing UTRs that lie outside of the CDS sod1_exons_prot_coding <- sod1_exons \%>\% dplyr::filter(transcript_biotype == "protein_coding") # extract cds sod1_cds <- sod1_annotation \%>\% dplyr::filter(type == "CDS") sod1_exons_prot_coding \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_cds ) + geom_intron( data = to_intron(sod1_exons_prot_coding, "transcript_name") ) # geom_half_range() can be useful for comparing between two transcripts # enabling visualization of one transcript on the top, other on the bottom sod1_201_exons <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-201") sod1_201_cds <- sod1_cds \%>\% dplyr::filter(transcript_name == "SOD1-201") sod1_202_exons <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-202") sod1_202_cds <- sod1_cds \%>\% dplyr::filter(transcript_name == "SOD1-202") sod1_201_plot <- sod1_201_exons \%>\% ggplot(aes( xstart = start, xend = end, y = "SOD1-201/202" )) + geom_half_range( fill = "white", height = 0.125 ) + geom_half_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) sod1_201_plot sod1_201_202_plot <- sod1_201_plot + geom_half_range( data = sod1_202_exons, range.orientation = "top", fill = "white", height = 0.125 ) + geom_half_range( data = sod1_202_cds, range.orientation = "top", fill = "purple" ) + geom_intron( data = to_intron(sod1_202_exons, "transcript_name") ) sod1_201_202_plot # leveraging existing ggplot2 functionality via e.g. coord_cartesian() # can be useful to zoom in on areas of interest sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000)) } ================================================ FILE: man/ggtranscript.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ggtranscript-package.R \docType{package} \name{ggtranscript} \alias{ggtranscript-package} \alias{ggtranscript} \title{\code{ggtranscript}: Visualizing transcript structure and annotation using \code{ggplot2}} \description{ The goal of \code{ggtranscript} is the simplify the process of visualizing transcript structure and annotation. To achieve this, \code{ggtranscript} introduces 5 new geoms (\code{geom_range()}, \code{geom_half_range()}, \code{geom_intron()}, \code{geom_junction()} and \code{geom_junction_label_repel()}) as well as several helper functions. As a \code{ggplot2} extension, \code{ggtranscript} inherits \code{ggplot2}'s familiarity and flexibility, enabling users to intuitively adjust aesthetics, parameters, scales etc as well as complement \code{ggtranscript} geoms with existing \code{ggplot2} geoms to create informative, publication-ready plots. } \seealso{ Useful links: \itemize{ \item \url{https://github.com/dzhang32/ggtranscript} \item Report bugs at \url{https://github.com/dzhang32/ggtranscript/issues} } } \author{ \strong{Maintainer}: David Zhang \email{dyzhang32@gmail.com} (\href{https://orcid.org/0000-0003-2382-8460}{ORCID}) Authors: \itemize{ \item Emil Gustavsson \email{e.gustavsson@ucl.ac.uk} (\href{https://orcid.org/0000-0003-0541-7537}{ORCID}) } Other contributors: \itemize{ \item Regina Reynolds \email{regina.reynolds.16@ucl.ac.uk} (\href{https://orcid.org/0000-0001-6470-7919}{ORCID}) [contributor] \item Sonia Ruiz \email{s.ruiz@ucl.ac.uk} [contributor] } } ================================================ FILE: man/shorten_gaps.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/shorten_gaps.R \name{shorten_gaps} \alias{shorten_gaps} \title{Improve transcript structure visualization by shortening gaps} \usage{ shorten_gaps(exons, introns, group_var = NULL, target_gap_width = 100L) } \arguments{ \item{exons}{\code{data.frame()} contains exons which can originate from multiple transcripts differentiated by \code{group_var}.} \item{introns}{\code{data.frame()} the intron co-ordinates corresponding to the input \code{exons}. This can be created by applying \code{to_intron()} to the \code{exons}. If introns originate from multiple transcripts, they must be differentiated using \code{group_var}. If a user is not using \code{to_intron()}, they must make sure intron start/ends are defined precisely as the adjacent exon boundaries (rather than exon end + 1 and exon start - 1).} \item{group_var}{\code{character()} if input data originates from more than 1 transcript, \code{group_var} must specify the column that differentiates transcripts (e.g. "transcript_id").} \item{target_gap_width}{\code{integer()} the width in base pairs to shorten the gaps to.} } \value{ \code{data.frame()} contains the re-scaled co-ordinates of \code{introns} and \code{exons} of each input transcript with shortened gaps. } \description{ For a given set of exons and introns, \code{shorten_gaps()} reduces the width of gaps (regions that do not overlap any \code{exons}) to a user-inputted \code{target_gap_width}. This can be useful when visualizing transcripts that have long introns, to hone in on the regions of interest (i.e. exons) and better compare between transcript structures. } \details{ After \code{shorten_gaps()} reduces the size of gaps, it will re-scale \code{exons} and \code{introns} to preserve exon alignment. This process will only reduce the width of input \code{introns}, never \code{exons}. Importantly, the outputted re-scaled co-ordinates should only be used for visualization as they will not match the original genomic coordinates. } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation pknox1_annotation \%>\% head() # extract exons pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon") pknox1_exons \%>\% head() # to_intron() is a helper function included in ggtranscript # which is useful for converting exon co-ordinates to introns pknox1_introns <- pknox1_exons \%>\% to_intron(group_var = "transcript_name") pknox1_introns \%>\% head() # for transcripts with long introns, the exons of interest # can be difficult to visualize clearly when using the default scale pknox1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = pknox1_introns, arrow.min.intron.length = 3500 ) # in such cases it can be useful to rescale the exons and introns # using shorten_gaps() which shortens regions that do not overlap an exon pknox1_rescaled <- shorten_gaps(pknox1_exons, pknox1_introns, group_var = "transcript_name") pknox1_rescaled \%>\% head() # this allows us to visualize differences in exonic structure more clearly pknox1_rescaled \%>\% dplyr::filter(type == "exon") \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = pknox1_rescaled \%>\% dplyr::filter(type == "intron"), arrow.min.intron.length = 300 ) # shorten_gaps() can be used in combination with to_diff() # to further highlight differences in exon structure # here, all other transcripts are compared to the MANE-select transcript pknox1_rescaled_diffs <- to_diff( exons = pknox1_rescaled \%>\% dplyr::filter(type == "exon", transcript_name != "PKNOX1-201"), ref_exons = pknox1_rescaled \%>\% dplyr::filter(type == "exon", transcript_name == "PKNOX1-201"), group_var = "transcript_name" ) pknox1_rescaled \%>\% dplyr::filter(type == "exon") \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = pknox1_rescaled \%>\% dplyr::filter(type == "intron"), arrow.min.intron.length = 300 ) + geom_range( data = pknox1_rescaled_diffs, aes(fill = diff_type), alpha = 0.2 ) } ================================================ FILE: man/sod1_annotation.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{sod1_annotation} \alias{sod1_annotation} \alias{pknox1_annotation} \title{Example transcript annotation} \format{ A \code{tibble::tibble()}: \describe{ \item{seqnames}{\code{factor()} chromosome.} \item{start}{\code{integer()} start position.} \item{end}{\code{integer()} end position.} \item{strand}{\code{factor()} strand.} \item{type}{\code{factor()} E.g.gene, transcript, exon or CDS.} \item{gene_name}{\code{character()} name of gene (GBA).} \item{transcript_name}{\code{character()} name of transcript.} \item{transcript_biotype}{\code{character()} biotype of transcript.} } An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 112 rows and 8 columns. } \source{ generated using \code{ggtranscript/data-raw/sod1_pknox1_annotation.R} } \usage{ sod1_annotation pknox1_annotation } \description{ Transcript annotation including the co-ordinates (hg38) of the genes, transcripts, exons and CDS regions for \emph{SOD1} and \emph{PKNOX1}, which originate from version 105 of the Ensembl reference annotation. } \keyword{datasets} ================================================ FILE: man/sod1_junctions.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{sod1_junctions} \alias{sod1_junctions} \title{Example junctions} \format{ A \code{tibble::tibble()}: \describe{ \item{seqnames}{\code{factor()} chromosome.} \item{start}{\code{integer()} start position.} \item{end}{\code{integer()} end position.} \item{strand}{\code{factor()} strand.} \item{mean_count}{\code{factor()} Average count across all GTEx liver samples.} } } \source{ generated using \code{ggtranscript/data-raw/sod1_junctions.R} } \usage{ sod1_junctions } \description{ Junction co-ordinates and counts associated with the \emph{SOD1} gene. Junctions counts originate from GTEx liver samples and are downloaded via the Bioconductor package \code{snapcount}. Only unannotated junctions with a mean count above 0.3 have been retained for this example. } \keyword{datasets} ================================================ FILE: man/to_diff.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_diff.R \name{to_diff} \alias{to_diff} \title{Obtain the differences between transcript structure} \usage{ to_diff(exons, ref_exons, group_var = NULL) } \arguments{ \item{exons}{\code{data.frame()} contains exons which can originate from multiple transcripts differentiated by \code{group_var}.} \item{ref_exons}{\code{data.frame()} contains exons that originate from a single transcript, which \code{exons} will be compared against.} \item{group_var}{\code{character()} if input data originates from more than 1 transcript, \code{group_var} must specify the column that differentiates transcripts (e.g. "transcript_id").} } \value{ \code{data.frame()} details the differences between \code{exons} and \code{ref_exons}. } \description{ \code{to_diff()} obtains the difference between \code{exons} from a set of transcripts to a reference transcript (\code{ref_exons}). This can be useful when visualizing the differences between transcript structure. \code{to_diff()} expects two sets of input exons; 1. \code{exons} - exons from any number of transcripts that will be compared to \code{ref_exons} and 2. \code{ref_exons} - exons from a single transcript which acts as the reference to compare against. } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation \%>\% head() # extract exons sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon") sod1_exons \%>\% head() # for this example, let's compare transcripts to the MANE-select transcript sod1_mane <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-201") sod1_not_mane <- sod1_exons \%>\% dplyr::filter(transcript_name != "SOD1-201") # to_diff() obtains the differences between the exons as ranges sod1_diffs <- to_diff( exons = sod1_not_mane, ref_exons = sod1_mane, group_var = "transcript_name" ) sod1_diffs \%>\% head() # using geom_range(), it can be useful to visually overlay # the differences on top of the transcript annotation sod1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = to_intron(sod1_exons, "transcript_name") ) + geom_range( data = sod1_diffs, ggplot2::aes(fill = diff_type), alpha = 0.2 ) } ================================================ FILE: man/to_intron.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_intron.R \name{to_intron} \alias{to_intron} \title{Convert exon co-ordinates to introns} \usage{ to_intron(exons, group_var = NULL) } \arguments{ \item{exons}{\code{data.frame()} contains exons which can originate from multiple transcripts differentiated by \code{group_var}.} \item{group_var}{\code{character()} if input data originates from more than 1 transcript, \code{group_var} must specify the column that differentiates transcripts (e.g. "transcript_id").} } \value{ \code{data.frame()} contains the intron co-ordinates. } \description{ Given a set of \code{exons}, \code{to_intron()} will return the corresponding introns. } \details{ It is important to note that, for visualization purposes, \code{to_intron()} defines introns precisely as the exon boundaries, rather than the intron start/end being (exon end + 1)/(exon start - 1). } \examples{ library(magrittr) library(ggplot2) # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation \%>\% head() # extract exons sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon") sod1_exons \%>\% head() # to_intron() is a helper function included in ggtranscript # which is useful for converting exon co-ordinates to introns sod1_introns <- sod1_exons \%>\% to_intron(group_var = "transcript_name") sod1_introns \%>\% head() # this can be particular useful when combined with # geom_range() and geom_intron() # to visualize the core components of transcript annotation sod1_exons \%>\% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = to_intron(sod1_exons, "transcript_name") ) } ================================================ FILE: tests/testthat/test-add_exon_number.R ================================================ sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") # create dummy transcripts with both positive and minus strand # purely for testing strand functionality test_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202") %>% dplyr::mutate(strand = "-") %>% dplyr::bind_rows( sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") ) ##### add_exon_number ##### testthat::test_that("add_exon_number() works correctly", { test_exon_number <- test_exons %>% add_exon_number(group_var = "transcript_name") test_exon_number_plus <- test_exon_number %>% dplyr::filter(strand == "+") test_exon_number_minus <- test_exon_number %>% dplyr::filter(strand == "-") expect_true("exon_number" %in% colnames(test_exon_number)) expect_true(is.numeric(test_exon_number[["exon_number"]])) expect_equal( test_exon_number_plus[["exon_number"]], seq_len(nrow(test_exon_number_plus)) ) expect_equal( test_exon_number_minus[["exon_number"]], seq_len(nrow(test_exon_number_minus)) %>% rev() ) # check order makes no difference set.seed(32) expect_equal( test_exons[sample(seq_len(nrow(test_exons)), nrow(test_exons)), ] %>% add_exon_number(group_var = "transcript_name"), test_exon_number ) }) testthat::test_that("add_exon_number(group_var = NULL) works correctly", { test_exon_number_plus <- test_exons %>% dplyr::filter(strand == "+") %>% add_exon_number(group_var = NULL) test_exon_number_minus <- test_exons %>% dplyr::filter(strand == "-") %>% add_exon_number(group_var = NULL) expect_equal( test_exon_number_plus[["exon_number"]], seq_len(nrow(test_exon_number_plus)) ) expect_equal( test_exon_number_minus[["exon_number"]], seq_len(nrow(test_exon_number_minus)) %>% rev() ) }) ================================================ FILE: tests/testthat/test-add_utr.R ================================================ pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") pknox1_cds <- pknox1_annotation %>% dplyr::filter(type == "CDS") pknox1_utr <- pknox1_annotation %>% dplyr::filter(grepl("utr", type)) ##### add_utr ##### # add 3 bp to end of cds as stop codon not included in ensembl cds pknox1_cds_w_stop <- pknox1_cds %>% dplyr::group_by(transcript_name) %>% dplyr::mutate( end = ifelse(end == max(end), end + 3, end) ) %>% dplyr::ungroup() pknox1_cds_utr <- add_utr( pknox1_exons, pknox1_cds_w_stop, group_var = "transcript_name" ) pknox1_cds_utr_1_tx <- add_utr( pknox1_exons %>% dplyr::filter(transcript_name == "PKNOX1-203"), pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203"), group_var = "transcript_name" ) pknox1_cds_utr_1_tx_no_group <- add_utr( pknox1_exons %>% dplyr::filter(transcript_name == "PKNOX1-203"), pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203"), group_var = NULL ) test_add_utrs <- function(cds_utr_add_utr, utr_annotation, cds_annotation) { utr_add_utr <- cds_utr_add_utr %>% dplyr::filter(type == "UTR") %>% dplyr::select(start, end) %>% dplyr::arrange(start, end) cds_add_utr <- cds_utr_add_utr %>% dplyr::filter(type == "CDS") %>% dplyr::select(start, end) %>% dplyr::arrange(start, end) utr_annotation <- utr_annotation %>% dplyr::select(start, end) %>% dplyr::arrange(start, end) cds_annotation <- cds_annotation %>% dplyr::select(start, end) %>% dplyr::arrange(start, end) no_na_type <- all(!is.na(cds_utr_add_utr[["type"]])) no_dummy_group <- is.null(cds_utr_add_utr[["dummy_group"]]) correct_utrs <- all.equal(utr_add_utr, utr_annotation) correct_cds <- all.equal(cds_add_utr, cds_annotation) check_add_utr <- all(no_na_type, no_dummy_group, correct_utrs, correct_cds) return(check_add_utr) } testthat::test_that( "add_utr() works correctly", { expect_true(test_add_utrs(pknox1_cds_utr, pknox1_utr, pknox1_cds_w_stop)) expect_true(test_add_utrs( pknox1_cds_utr_1_tx, pknox1_utr %>% dplyr::filter(transcript_name == "PKNOX1-203"), pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203") )) expect_true(test_add_utrs( pknox1_cds_utr_1_tx_no_group, pknox1_utr %>% dplyr::filter(transcript_name == "PKNOX1-203"), pknox1_cds_w_stop %>% dplyr::filter(transcript_name == "PKNOX1-203") )) } ) ##### add_utr & shorten_gaps ##### pknox1_cds_utr_rescaled <- shorten_gaps( exons = pknox1_cds_utr, introns = to_intron(pknox1_cds_utr, "transcript_name"), group_var = "transcript_name" ) # add labels helps manual checking plot_before_after_rescaled <- function(cds_utr_before, cds_utr_after, group_var, add_labels = FALSE) { before_rescaling <- cds_utr_before %>% dplyr::filter(type == "CDS") %>% ggplot2::ggplot(ggplot2::aes( xstart = start, xend = end, y = .data[[group_var]] )) + geom_range() + geom_range( data = cds_utr_before %>% dplyr::filter(type == "UTR"), height = 0.25, fill = "white" ) + geom_intron( data = to_intron(cds_utr_before, "transcript_name"), ) after_rescaling <- cds_utr_after %>% dplyr::filter(type == "CDS") %>% ggplot2::ggplot(ggplot2::aes( xstart = start, xend = end, y = .data[[group_var]] )) + geom_range() + geom_range( data = cds_utr_after %>% dplyr::filter(type == "UTR"), height = 0.25, fill = "white" ) + geom_intron( data = to_intron( cds_utr_after %>% dplyr::filter(type != "intron"), "transcript_name" ), ) before_after_list <- list(before_rescaling, after_rescaling) if (add_labels) { for (i in seq_len(length(before_after_list))) { before_after_list[[i]] <- before_after_list[[i]] + ggrepel::geom_label_repel( ggplot2::aes_string( x = "start", y = group_var, label = "start" ), min.segment.length = 0 ) } } before_after_plot <- ggpubr::ggarrange( plotlist = before_after_list, nrow = 2 ) return(before_after_plot) } testthat::test_that( "shorten_gaps works correctly", { test_rescaled_w_utr_plot <- plot_before_after_rescaled( pknox1_cds_utr, pknox1_cds_utr_rescaled, group_var = "transcript_name", add_labels = FALSE ) vdiffr::expect_doppelganger( "test rescaled with utr plot", test_rescaled_w_utr_plot ) } ) ================================================ FILE: tests/testthat/test-geom_half_range.R ================================================ # create dummy exons for testing test_exons <- dplyr::tibble( start = c(100, 300, 500, 650), end = start + 100, strand = c("+", "+", "-", "-"), tx = c("A", "A", "B", "B") ) # create base plot to be used in downstream tests test_exons_plot <- test_exons %>% ggplot2::ggplot(aes( xstart = start, xend = end, y = tx )) ##### geom_half_range ##### testthat::test_that( "geom_half_range() works correctly", { base_geom_half_range <- test_exons_plot + geom_half_range() w_param_geom_half_range <- test_exons_plot + geom_half_range(colour = "red", fill = "blue") w_aes_geom_half_range <- test_exons_plot + geom_half_range(aes(fill = tx)) w_facet_geom_half_range <- test_exons_plot + geom_half_range() + ggplot2::facet_wrap(~tx) vdiffr::expect_doppelganger( "Base geom_half_range plot", base_geom_half_range ) vdiffr::expect_doppelganger( "With param geom_half_range plot", w_param_geom_half_range ) vdiffr::expect_doppelganger( "With aes geom_half_range plot", w_aes_geom_half_range ) vdiffr::expect_doppelganger( "With facet geom_half_range plot", w_facet_geom_half_range ) } ) testthat::test_that( "geom_half_range(range.orientation = x) works correctly", { w_top_geom_half_range <- test_exons_plot + geom_half_range(range.orientation = "top") w_both_geom_half_range <- test_exons_plot + geom_half_range(range.orientation = "top", fill = "red") + geom_half_range(range.orientation = "bottom", fill = "blue") vdiffr::expect_doppelganger( "With top geom_half_range plot", w_top_geom_half_range ) vdiffr::expect_doppelganger( "With both geom_half_range plot", w_both_geom_half_range ) } ) testthat::test_that( "geom_half_range() catches user input errors", { a_range.orientation <- test_exons_plot + geom_half_range(range.orientation = "a") expect_error( print(a_range.orientation), "range.orientation must be one of" ) } ) ================================================ FILE: tests/testthat/test-geom_intron.R ================================================ test_introns <- dplyr::tibble( strand = c("+", "-"), tx = c("A", "B"), start = c(201, 601), end = c(299, 649), type = "intron" ) # create base plot to be used in downstream tests test_introns_plot <- test_introns %>% ggplot2::ggplot(aes( xstart = start, xend = end, y = tx )) ##### geom_intron ##### testthat::test_that( "geom_intron() works correctly", { base_geom_intron <- test_introns_plot + geom_intron() w_param_geom_intron <- test_introns_plot + geom_intron(colour = "blue", linewidth = 2) w_aes_geom_intron <- test_introns_plot + geom_intron(aes(colour = tx, linewidth = c(1L, 2L))) w_facet_geom_intron <- test_introns_plot + geom_intron() + ggplot2::facet_wrap(~tx) vdiffr::expect_doppelganger( "Base geom_intron plot", base_geom_intron ) vdiffr::expect_doppelganger( "With param geom_intron plot", w_param_geom_intron ) vdiffr::expect_doppelganger( "With aes geom_intron plot", w_aes_geom_intron ) vdiffr::expect_doppelganger( "With facet geom_intron plot", w_facet_geom_intron ) } ) testthat::test_that( "geom_intron(strand = x) works correctly", { minus_strand <- test_introns_plot + geom_intron(strand = "-") factor_strand <- test_introns_plot + geom_intron(strand = factor("-")) as_aes_strand <- test_introns_plot + geom_intron(aes(strand = strand)) vdiffr::expect_doppelganger( "Minus strand plot", minus_strand ) vdiffr::expect_doppelganger( "factor strand plot", factor_strand ) vdiffr::expect_doppelganger( "As aes strand plot", as_aes_strand ) } ) testthat::test_that( "geom_intron(arrow.min.intron.length = x) works correctly", { base_arrow.min <- test_introns_plot + geom_intron(arrow.min.intron.length = 50) w_strand_arrow_min <- test_introns_plot + geom_intron(arrow.min.intron.length = 50, strand = "-") vdiffr::expect_doppelganger( "base arrow.min plot", base_arrow.min ) vdiffr::expect_doppelganger( "with strand arrow.min plot", w_strand_arrow_min ) } ) testthat::test_that( "geom_intron() catches strand input errors", { na_strand <- test_introns_plot + geom_intron(strand = c(NA, rep("+", nrow(test_introns) - 1))) a_strand <- test_introns_plot + geom_intron(strand = "a") int_strand <- test_introns_plot + geom_intron(aes(strand = start)) # seems to require print to catch error expect_error( print(na_strand), "strand values must be one of" ) expect_error( print(a_strand), "strand values must be one of" ) expect_error( print(int_strand), "strand values must be one of" ) } ) testthat::test_that( "geom_intron() catches arrow.min.intron.length input errors", { neg_arrow.min <- test_introns_plot + geom_intron(arrow.min.intron.length = -1) chr_arrow.min <- test_introns_plot + geom_intron(arrow.min.intron.length = "1") # seems to require print to catch error expect_error( print(neg_arrow.min), "arrow.min.intron.length must be " ) expect_error( print(chr_arrow.min), "arrow.min.intron.length must be " ) } ) ================================================ FILE: tests/testthat/test-geom_junction.R ================================================ # manually create the expected introns test_introns <- sod1_annotation %>% dplyr::filter(type == "exon") %>% to_intron(group_var = "transcript_name") # create base plot to be used in downstream tests test_introns_plot <- test_introns %>% ggplot2::ggplot(aes( xstart = start, xend = end, y = transcript_name )) ##### geom_junction ##### testthat::test_that( "geom_junction() works correctly", { base_geom_junction <- test_introns_plot + geom_junction() w_param_geom_junction <- test_introns_plot + geom_junction(colour = "red") w_aes_geom_junction <- test_introns_plot + geom_junction(aes(colour = transcript_name)) w_facet_geom_junction <- test_introns_plot + geom_junction() + ggplot2::facet_wrap(~transcript_biotype) vdiffr::expect_doppelganger( "Base geom_junction plot", base_geom_junction ) vdiffr::expect_doppelganger( "With param geom_junction plot", w_param_geom_junction ) vdiffr::expect_doppelganger( "With aes geom_junction plot", w_aes_geom_junction ) vdiffr::expect_doppelganger( "With facet geom_junction plot", w_facet_geom_junction ) } ) testthat::test_that( "geom_junction(junction.orientation = x) works correctly", { top_junction.orientation <- test_introns_plot + geom_junction(junction.orientation = "top") bottom_junction.orientation <- test_introns_plot + geom_junction(junction.orientation = "bottom") w_aes_param_top_junction.orientation <- test_introns_plot + geom_junction( aes(colour = transcript_name), linewidth = 1, junction.orientation = "top" ) vdiffr::expect_doppelganger( "top junction.orientation plot", top_junction.orientation ) vdiffr::expect_doppelganger( "bottom junction.orientation plot", bottom_junction.orientation ) vdiffr::expect_doppelganger( "with aes and param top junction.orientation plot", w_aes_param_top_junction.orientation ) } ) testthat::test_that( "geom_junction(junction.y.max = x) works correctly", { junction.y.max_0.5 <- test_introns_plot + geom_junction(junction.y.max = 0.5) w_aes_param_junction.y.max_0.5 <- test_introns_plot + geom_junction( aes(colour = transcript_name), linewidth = 1, junction.y.max = 0.5 ) w_facet_junction.y.max_0.5 <- test_introns_plot + geom_junction(junction.y.max = 0.5) + ggplot2::facet_wrap(~transcript_biotype) vdiffr::expect_doppelganger( "0.5 junction.y.max plot", junction.y.max_0.5 ) vdiffr::expect_doppelganger( "with aes and param 0.5 junction.y.max plot", w_aes_param_junction.y.max_0.5 ) vdiffr::expect_doppelganger( "with facet 0.5 junction.y.max plot", w_facet_junction.y.max_0.5 ) } ) testthat::test_that( "geom_junction() catches junction.orientation input errors", { a_junction.orientation <- test_introns_plot + geom_junction(junction.orientation = "a") expect_error( print(a_junction.orientation), "junction.orientation must be one of" ) } ) testthat::test_that( "geom_junction() catches junction.y.max input errors", { len_2_junction.y.max <- test_introns_plot + geom_junction(junction.y.max = c(1, 2)) a_junction.y.max <- test_introns_plot + geom_junction(junction.y.max = "a") expect_error( print(len_2_junction.y.max), "junction.y.max must have a length of 1" ) expect_error( print(a_junction.y.max), "junction.y.max must be a numeric value" ) } ) ================================================ FILE: tests/testthat/test-geom_junction_label_repel.R ================================================ # manually create the expected introns test_introns <- sod1_annotation %>% dplyr::filter( type == "exon", transcript_name %in% c("SOD1-201", "SOD1-202") ) %>% to_intron(group_var = "transcript_name") %>% dplyr::mutate( count = dplyr::row_number() ) # create base plot to be used in downstream tests test_introns_plot <- test_introns %>% ggplot2::ggplot(aes( xstart = start, xend = end, y = transcript_name )) ##### geom_junction_label_repel ##### testthat::test_that( "geom_junction() works correctly", { base_geom_junction_labels <- test_introns_plot + geom_junction() + geom_junction_label_repel( aes(label = count), seed = 32 ) w_param_geom_junction_labels <- test_introns_plot + geom_junction( junction.y.max = 0.5 ) + geom_junction_label_repel( aes(label = count), junction.y.max = 0.5, seed = 32 ) w_aes_geom_junction_labels <- test_introns_plot + geom_junction(aes(colour = transcript_name)) + geom_junction_label_repel( aes( label = count, colour = transcript_name ), seed = 32 ) w_facet_geom_junction_labels <- test_introns_plot + geom_junction() + geom_junction_label_repel( aes(label = count), seed = 32 ) + ggplot2::facet_wrap(transcript_name ~ ., drop = TRUE) vdiffr::expect_doppelganger( "Base geom_junction_label_repel plot", base_geom_junction_labels ) vdiffr::expect_doppelganger( "With param geom_junction_label_repel plot", w_param_geom_junction_labels ) vdiffr::expect_doppelganger( "With aes geom_junction_label_repel plot", w_aes_geom_junction_labels ) vdiffr::expect_doppelganger( "With facet geom_junction_label_repel plot", w_facet_geom_junction_labels ) } ) ================================================ FILE: tests/testthat/test-geom_range.R ================================================ # create dummy exons for testing test_exons <- dplyr::tibble( start = c(100, 300, 500, 650), end = start + 100, strand = c("+", "+", "-", "-"), tx = c("A", "A", "B", "B") ) # create base plot to be used in downstream tests test_exons_plot <- test_exons %>% ggplot2::ggplot(aes( xstart = start, xend = end, y = tx )) ##### geom_range ##### testthat::test_that( "geom_range() works correctly", { base_geom_range <- test_exons_plot + geom_range() w_param_geom_range <- test_exons_plot + geom_range(colour = "red", fill = "blue") w_aes_geom_range <- test_exons_plot + geom_range(aes(fill = tx)) w_facet_geom_range <- test_exons_plot + geom_range() + ggplot2::facet_wrap(~tx) vdiffr::expect_doppelganger( "Base geom_range plot", geom_range ) vdiffr::expect_doppelganger( "With param geom_range plot", w_param_geom_range ) vdiffr::expect_doppelganger( "With aes geom_range plot", w_aes_geom_range ) vdiffr::expect_doppelganger( "With facet geom_range plot", w_facet_geom_range ) } ) testthat::test_that( "geom_range(vjust = x) works correctly", { w_vjust_geom_range <- test_exons_plot + geom_range(vjust = 1.5, height = 0.25) vdiffr::expect_doppelganger( "With vjust geom_range plot", w_vjust_geom_range ) } ) ================================================ FILE: tests/testthat/test-shorten_gaps.R ================================================ test_exons <- dplyr::tibble( seqnames = "1", start = c(100, 300, 500, 650), end = start + 100, strand = "+", tx = c("A", "A", "B", "B") ) pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") pknox1_introns <- pknox1_exons %>% to_intron("transcript_name") ##### .get_gaps ##### # need to create gaps globally for downstream tests pknox1_intron_gaps <- .get_gaps(GenomicRanges::GRanges(pknox1_exons)) test_intron_gaps <- .get_gaps(GenomicRanges::GRanges(test_exons)) test_.get_gaps <- function(exons, intron_gaps) { # intron_gaps should not overlap any exons exons_gap_hits <- GenomicRanges::findOverlaps( GenomicRanges::GRanges(exons), intron_gaps ) overlap_exons <- length(exons_gap_hits) == 0 return(overlap_exons) } testthat::test_that(".get_gaps() works correctly", { expect_true(test_.get_gaps( pknox1_exons, pknox1_intron_gaps )) expect_true(test_.get_gaps( test_exons, test_intron_gaps )) }) ##### .get_tx_start_gaps ##### pknox1_tx_start_gaps <- .get_tx_start_gaps(pknox1_exons, "transcript_name") test_exons_tx_start_gaps <- .get_tx_start_gaps(test_exons, NULL) test_.get_tx_start_gaps <- function(exons, tx_start_gaps, group_var) { unique_start <- length(unique(tx_start_gaps[["start"]])) == 1 correct_start <- all(tx_start_gaps[["start"]] == min(exons[["start"]])) correct_end <- exons %>% dplyr::group_by_at(.vars = group_var) %>% dplyr::summarise(tx_start = min(start)) correct_end <- all(tx_start_gaps[["end"]] == correct_end[["tx_start"]]) correct_all <- all( unique_start, correct_start, correct_end ) return(correct_all) } testthat::test_that(".get_tx_start_gaps() works correctly", { expect_true(test_.get_tx_start_gaps( pknox1_exons, pknox1_tx_start_gaps, "transcript_name" )) expect_true(test_.get_tx_start_gaps( test_exons, test_exons_tx_start_gaps, NULL )) }) ##### .check_len_1_strand_seqnames ##### testthat::test_that( ".check_len_1_strand_seqnames() catches user input errors", { expect_error( .check_len_1_strand_seqnames(1:2, 1), "seqnames of object contains more than 1 unique value" ) expect_error( .check_len_1_strand_seqnames(1, 1:2), "strand of object contains more than 1 unique value" ) } ) ##### .check_type ##### testthat::test_that(".get_type() works correctly", { added_type_exons <- pknox1_exons %>% dplyr::select(-type) %>% .get_type("exons") %>% .[["type"]] added_type_introns <- pknox1_exons %>% dplyr::select(-type) %>% .get_type("introns") %>% .[["type"]] expect_true( all(added_type_exons == "exon") ) expect_true( all(added_type_introns == "intron") ) expect_identical( .get_type(pknox1_exons, "exons"), pknox1_exons ) expect_identical( .get_type(pknox1_introns, "introns"), pknox1_introns ) }) testthat::test_that(".get_type() catches user input errors", { expect_error( .get_type(pknox1_exons, "introns"), "values in the 'type' column of introns must be one of:" ) }) ##### .check_target_gap_width ##### testthat::test_that(".check_target_gap_width() catches user input errors", { expect_warning( .check_target_gap_width(100), "target_gap_width must be an integer, coercing" ) }) ##### shorten_gaps ##### # also using this to test drop_orig_coords pknox1_rescaled_tx <- shorten_gaps( pknox1_exons, pknox1_introns, group_var = "transcript_name", target_gap_width = 100L ) pknox1_exons_1_tx <- pknox1_exons %>% dplyr::filter(transcript_name == "PKNOX1-202") pknox1_introns_1_tx <- pknox1_introns %>% dplyr::filter(transcript_name == "PKNOX1-202") pknox1_rescaled_1_tx <- shorten_gaps( pknox1_exons_1_tx, pknox1_introns_1_tx, group_var = "transcript_name", target_gap_width = 100L ) pknox1_rescaled_1_tx_no_group <- shorten_gaps( pknox1_exons_1_tx, pknox1_introns_1_tx, group_var = NULL, target_gap_width = 100L ) test_rescaled_tx <- shorten_gaps( test_exons, to_intron(test_exons, "tx"), group_var = "tx", target_gap_width = 50L ) testthat::test_that("shorten_gaps() keeps existing columns", { expect_true(!is.null(pknox1_rescaled_tx[["transcript_biotype"]])) expect_true(!is.null( pknox1_rescaled_1_tx_no_group[["transcript_biotype"]] )) }) testthat::test_that("shorten_gaps() takes user inputted type", { # for test, we modify all exons types to "utr" all_utr <- shorten_gaps( pknox1_exons %>% dplyr::mutate(type = "utr"), pknox1_introns, group_var = "transcript_name", target_gap_width = 100L ) expect_true(all(pknox1_rescaled_tx[["type"]] %in% c("exon", "intron"))) expect_true(all(all_utr[["type"]] %in% c("utr", "intron"))) }) test_shorten_gaps <- function(exons, rescaled_tx) { # should never shorten exons exon_widths_before <- exons[["end"]] - exons[["start"]] exon_widths_after <- rescaled_tx %>% dplyr::filter(type == "exon") %>% dplyr::mutate(width = end - start) %>% .[["width"]] unchanged_exon_widths <- all.equal( sort(exon_widths_before), sort(exon_widths_after) ) return(unchanged_exon_widths) } testthat::test_that("shorten_gaps() never modifies exons", { expect_true(test_shorten_gaps( pknox1_exons, pknox1_rescaled_tx )) expect_true(test_shorten_gaps( pknox1_exons_1_tx, pknox1_rescaled_1_tx )) expect_true(test_shorten_gaps( pknox1_exons_1_tx, pknox1_rescaled_1_tx_no_group )) expect_true(test_shorten_gaps( test_exons, test_rescaled_tx )) }) # add labels helps manual checking plot_rescaled_tx <- function(exons, rescaled_tx, group_var, add_labels = FALSE) { before_rescaling <- exons %>% ggplot2::ggplot(ggplot2::aes( xstart = start, xend = end, y = .data[[group_var]] )) + geom_range() + geom_intron( data = to_intron(exons, group_var), strand = "-", arrow.min.intron.length = 500 ) after_rescaling <- rescaled_tx %>% dplyr::filter(type == "exon") %>% ggplot2::ggplot(ggplot2::aes( xstart = start, xend = end, y = .data[[group_var]] )) + geom_range() + geom_intron( data = rescaled_tx %>% dplyr::filter(type == "intron"), strand = "-", arrow.min.intron.length = 500 ) before_after_list <- list(before_rescaling, after_rescaling) if (add_labels) { for (i in seq_len(length(before_after_list))) { before_after_list[[i]] <- before_after_list[[i]] + ggrepel::geom_label_repel( ggplot2::aes( x = end, y = .data[[group_var]], label = end ), linewidth = 2, min.segment.length = 0 ) } } before_after_plot <- ggpubr::ggarrange( plotlist = before_after_list, nrow = 2 ) return(before_after_plot) } testthat::test_that( "shorten_gaps works correctly", { test_rescaled_plot <- plot_rescaled_tx( test_exons, test_rescaled_tx, "tx" ) pknox1_rescaled_plot <- plot_rescaled_tx( pknox1_exons, pknox1_rescaled_tx, "transcript_name" ) pknox1_rescaled_plot_1_tx <- plot_rescaled_tx( pknox1_exons_1_tx, pknox1_rescaled_1_tx, "transcript_name" ) # make sure everything works okay even if group is set to NULL pknox1_plot_1_tx_no_group <- plot_rescaled_tx( pknox1_exons_1_tx, pknox1_rescaled_1_tx_no_group, "transcript_name" ) vdiffr::expect_doppelganger( "test exons rescaled plot", test_rescaled_plot ) vdiffr::expect_doppelganger( "pknox1 rescaled plot", pknox1_rescaled_plot ) vdiffr::expect_doppelganger( "pknox1 rescaled plot 1 tx", pknox1_rescaled_plot_1_tx ) vdiffr::expect_doppelganger( "pknox1 rescaled plot 1 tx no group", pknox1_plot_1_tx_no_group ) } ) ================================================ FILE: tests/testthat/test-to_diff.R ================================================ sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") mane <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") single_tx <- sod1_exons %>% dplyr::filter(transcript_name %in% c("SOD1-202")) multi_tx <- sod1_exons %>% dplyr::filter(transcript_name %in% c("SOD1-202", "SOD1-203", "SOD1-204")) ##### to_diff ##### testthat::test_that("to_diff() works correctly", { test_diffs <- to_diff( exons = single_tx, ref_exons = mane ) expect_true(is.data.frame(test_diffs)) expect_true(nrow(test_diffs) > 0) expect_true(all( c("seqnames", "start", "end", "strand", "type", "diff_type") %in% colnames(test_diffs) )) }) testthat::test_that("to_diff() works correctly for single transcripts", { test_diffs <- to_diff( exons = single_tx, ref_exons = mane, group_var = "transcript_name" ) # think the easiest way to check diffs is via plotting single_tx_diff_plot <- mane %>% dplyr::bind_rows(single_tx) %>% ggplot2::ggplot( aes( xstart = start, xend = end, y = transcript_name ) ) + geom_range() + geom_range( data = test_diffs, alpha = 0.2, fill = "red" ) vdiffr::expect_doppelganger( "single tx diff plot", single_tx_diff_plot ) }) testthat::test_that("to_diff() works correctly for multiple transcripts", { test_diffs <- to_diff( exons = multi_tx, ref_exons = mane, group_var = "transcript_name" ) multi_tx_diff_plot <- mane %>% dplyr::bind_rows(multi_tx) %>% ggplot2::ggplot( aes( xstart = start, xend = end, y = transcript_name ) ) + geom_range() + geom_range( data = test_diffs, aes(fill = diff_type), alpha = 0.2, ) vdiffr::expect_doppelganger( "multi tx diff plot", multi_tx_diff_plot ) }) ================================================ FILE: tests/testthat/test-to_intron.R ================================================ # create dummy exons for testing test_exons <- dplyr::tibble( start = c(100, 300, 500, 650), end = start + 100, strand = c("+", "+", "-", "-"), tx = c("A", "A", "B", "B") ) # manually create the expected introns test_introns <- dplyr::tibble( strand = c("+", "-"), tx = c("A", "B"), start = c(200, 600), end = c(300, 650), type = "intron" ) pknox1_cds_utr <- pknox1_annotation %>% dplyr::filter( type == "CDS" | grepl("utr", type) ) ##### to_intron ##### testthat::test_that("to_intron() obtains introns correctly", { # with group_var expect_equal( test_introns, test_exons %>% to_intron(group_var = "tx") ) # without group_var expect_equal( test_introns %>% dplyr::filter(tx != "B"), test_exons %>% dplyr::filter(tx != "B") %>% to_intron() ) }) testthat::test_that( "to_intron() obtains introns correctly, regardless of exon order", { set.seed(32) expect_equal( test_introns, test_exons %>% .[sample(seq_len(nrow(test_exons))), ] %>% to_intron(group_var = "tx") ) } ) ================================================ FILE: tests/testthat/test-utils.R ================================================ # create dummy exons for testing test_exons <- dplyr::tibble( start = c(100, 300, 500, 650), end = start + 100, strand = c("+", "+", "-", "-"), tx = c("A", "A", "B", "B") ) ##### .check_coord_object ##### testthat::test_that(".check_coord_object() works correctly", { expect_equal( .check_coord_object(test_exons), NULL ) }) testthat::test_that(".check_coord_object() catches user input errors", { expect_error( .check_coord_object("1"), "must be a data.frame" ) expect_error( .check_coord_object(test_exons %>% dplyr::select(-start)), "must have the columns" ) expect_error( .check_coord_object(test_exons %>% dplyr::select(-end)), "must have the columns" ) expect_error( .check_coord_object(test_exons, check_seqnames = TRUE), "must have the column" ) expect_error( .check_coord_object( test_exons %>% dplyr::select(-strand), check_strand = TRUE ), "must have the column" ) }) ##### .check_group_var ##### testthat::test_that(".check_group_var() works correctly", { expect_equal( .check_group_var(test_exons, group_var = NULL), NULL ) expect_equal( .check_group_var(test_exons, group_var = "tx"), NULL ) }) testthat::test_that(".check_group_var() catches user input errors", { expect_error( .check_group_var(test_exons, "not_a_col"), "must be a column in" ) }) ================================================ FILE: tests/testthat.R ================================================ library(testthat) library(ggtranscript) test_check("ggtranscript") ================================================ FILE: vignettes/.gitignore ================================================ *.html *.R ================================================ FILE: vignettes/ggtranscript.Rmd ================================================ --- title: "Getting started" author: - name: David Zhang affiliation: - UCL email: dyzhang32@gmail.com output: BiocStyle::html_document: self_contained: yes toc: true toc_float: true toc_depth: 2 code_folding: show package: "`r pkg_ver('ggtranscript')`" vignette: > %\VignetteIndexEntry{Introduction to ggtranscript} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL ) ``` ```{r load-ggtranscript} library(magrittr) library(dplyr) library(ggtranscript) library(ggplot2) library(rtracklayer) ``` `ggtranscript` is designed to make it easy to visualize transcript structure and annotation using `ggplot2`. As the intended users are those who work with genetic and/or transcriptomic data in `R`, this tutorial assumes a basic understanding of transcript annotation and familiarity with `ggplot2`.
## Input data ### Example data In order to showcase the package's functionality, `ggtranscript` includes example transcript annotation for the genes *SOD1* and *PKNOX1*, as well as a set of unannotated junctions associated with *SOD1*. These specific genes are unimportant, chosen arbitrarily for illustration. However, it worth noting that the input data for `ggtranscript`, as a `ggplot2` extension, is required be a [`data.frame`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/data.frame) or [`tibble`](https://tibble.tidyverse.org). ```{r example-data} sod1_annotation %>% head() pknox1_annotation %>% head() sod1_junctions ``` ### Importing data from a gtf You may be asking, what if I have a `gtf` file or a `GRanges` object? The below demonstrates how to wrangle a `gtf` into the required format for `ggtranscript` and extract the relevant annotation for a particular gene of interest. For the purposes of the vignette, here we download a `gtf` (Ensembl version 105), then load the `gtf` into `R` using `rtracklayer::import()`. ```{r import-gtf-data} # download ens 105 gtf into a temporary directory gtf_path <- file.path(tempdir(), "Homo_sapiens.GRCh38.105.chr.gtf.gz") download.file( paste0( "http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/", "Homo_sapiens.GRCh38.105.chr.gtf.gz" ), destfile = gtf_path ) gtf <- rtracklayer::import(gtf_path) class(gtf) ``` To note, the loaded `gtf` is a `GRanges` class object. The input data for `ggtranscript`, as a `ggplot2` extension, is required be a [`data.frame`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/data.frame) or [`tibble`](https://tibble.tidyverse.org). We can convert a `GRanges` to a `data.frame` using `as.data.frame` or a `tibble` via `dplyr::as_tibble()`. Either is fine with respect to `ggtranscript`, however we prefer `tibble`s over `data.frame`s for several [reasons](https://r4ds.had.co.nz/tibbles.html#tibbles-vs.-data.frame). ```{r convert-gtf-df} gtf <- gtf %>% dplyr::as_tibble() class(gtf) ``` Now that the `gtf` is a `tibble` (or `data.frame` object), we can `dplyr::filter()` rows and `dplyr::select()` columns to keep the annotation columns required for any specific gene of interest. Here, we illustrate how you would obtain the annotation for the gene *SOD1*, ready for plotting with `ggtranscript`. ```{r get-sod1-annot} # filter your gtf for the gene of interest, here "SOD1" gene_of_interest <- "SOD1" sod1_annotation_from_gtf <- gtf %>% dplyr::filter( !is.na(gene_name), gene_name == gene_of_interest ) # extract the required annotation columns sod1_annotation_from_gtf <- sod1_annotation_from_gtf %>% dplyr::select( seqnames, start, end, strand, type, gene_name, transcript_name, transcript_biotype ) sod1_annotation_from_gtf %>% head() ``` ### Importing data from a bed file If users would like to plot ranges from a `bed` file using `ggtranscript`, they can first import the `bed` file into `R` using `rtracklayer::import.bed()`. This method will create a `UCSCData` object. ```{r import-bed-data} # for the example, we'll use the test bed file provided by rtracklayer test_bed <- system.file("tests/test.bed", package = "rtracklayer") bed <- rtracklayer::import.bed(test_bed) class(bed) ``` A `UCSCData` object can be coerced into a `tibble`, a data structure which can be plotted using `ggplot2`/`ggtranscript`, using `dplyr::as_tibble()`. ```{r convert-bed-df} bed <- bed %>% dplyr::as_tibble() class(bed) bed %>% head() ```
## Basic usage ### Required aesthetics `ggtranscript` introduces 5 new geoms designed to simplify the visualization of transcript structure and annotation; `geom_range()`, `geom_half_range()`, `geom_intron()`, `geom_junction()` and `geom_junction_label_repel()`. The required aesthetics (`aes()`) for these geoms are designed to match the data formats which are widely used in genetic and transcriptomic analyses: ```{r geom-aes, echo = FALSE} dplyr::tribble( ~`Required aes()`, ~Type, ~Description, ~`Required by`, #-------|-----|----|-------- "xstart", "integer", "Start position in base pairs", "All geoms", "xend", "integer", "End position in base pairs", "All geoms", "y", "charactor or factor", "The group used for the y axis, most often a transcript id or name ", "All geoms", "label", "integer or charactor", "Variable used to label junction curves", "Only geom_junction_label_repel()", ) %>% knitr::kable() ``` ### Plotting exons and introns {#plotting_exons_and_introns} In the simplest case, the core components of transcript structure are exons and introns, which can be plotted using `geom_range()` and `geom_intron()`. In order to facilitate this, `ggtranscript` also provides `to_intron()`, which converts exon co-ordinates into introns. Therefore, you can plot transcript structures with only exons as input and just a few lines of code. > 📌: As `ggtranscript` geoms share required aesthetics, it is recommended to set these via `ggplot()`, rather than in the individual `geom_*()` calls to avoid code duplication. ```{r geom-range-intron} # to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation %>% head() # extract exons sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") sod1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(sod1_exons, "transcript_name"), aes(strand = strand) ) ``` ### Differentiating UTRs from the coding sequence As suggested by it's name, `geom_range()` is designed to visualize range-based transcript annotation. This includes but is not limited to exons. For instance, for protein coding transcripts it can be useful to visually distinguish the coding sequence (CDS) of a transcript from it's UTRs. This can be achieved by adjusting the height and fill of `geom_range()` and overlaying the CDS on top of the exons (including UTRs). ```{r geom-range-intron-w-cds} # filter for only exons from protein coding transcripts sod1_exons_prot_cod <- sod1_exons %>% dplyr::filter(transcript_biotype == "protein_coding") # obtain cds sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS") sod1_exons_prot_cod %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_cds ) + geom_intron( data = to_intron(sod1_exons_prot_cod, "transcript_name"), aes(strand = strand), arrow.min.intron.length = 500, ) ``` ### Plotting junctions `geom_junction()` plots curved lines that are intended to represent junction reads. Junctions are reads obtained through RNA-sequencing (RNA-seq) data that map with gapped alignment to the genome. Often, this gap is indicative of a splicing event, but can also originate from other genomic events such as indels. It can be useful to visually overlay junctions on top of an existing transcript structure. For example, this can help to understand which existing transcripts are expressed in the RNA-seq sample or inform the location or interpretation of the novel splice sites. `geom_junction_label_repel()` adds labels to junction curves. This can useful for labeling junctions with a measure of their expression or support such as read counts or percent-spliced-in. Alternatively, you may choose to visually map this measure to the thickness of the junction curves by adjusting the the size `aes()`. Or, as shown below, both of these options can be combined. ```{r geom-junction, fig.height = 3} # extract exons and cds for the MANE-select transcript sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201") # add transcript name column to junctions for plotting sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201") sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, aes(size = mean_count), junction.y.max = 0.5 ) + geom_junction_label_repel( data = sod1_junctions, aes(label = round(mean_count, 2)), junction.y.max = 0.5 ) + scale_size_continuous(range = c(0.1, 1)) ```
## Visualizing transcript structure differences ### Context One of the primary reasons for visualizing transcript structures is to better observe the differences between them. Often this can be achieved by simply plotting the exons and introns as shown in [basic usage](#plotting_exons_and_introns). However, for longer, complex transcripts this may not be as straight forward. For example, the transcripts of *PKNOX1* have relatively long introns, which makes the comparison between transcript structures (especially small differences in exons) more difficult. > 📌: For relatively short introns, strand arrows may overlap exons. In such cases, the `arrow.min.intron.length` parameter of `geom_intron()` can be used to set the minimum intron length for a strand arrow to be plotted. ```{r transcript-diff-base} # extract exons pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") pknox1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(pknox1_exons, "transcript_name"), aes(strand = strand), arrow.min.intron.length = 3500 ) ``` ### Improving transcript structure visualisation using `shorten_gaps()` `ggtranscript` provides the helper function `shorten_gaps()`, which reduces the size of the gaps (regions that do not overlap an exon). `shorten_gaps()` then rescales the exon and intron co-ordinates, preserving the original exon alignment. This allows you to hone in the differences of interest, such as the exonic structure. > 📌: The rescaled co-ordinates returned by `shorten_gaps()` will not match the original genomic positions. Therefore, it is recommended that `shorten_gaps()` is used for visualizations purposes only. ```{r shorten-gaps} # extract exons pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") pknox1_rescaled <- shorten_gaps( exons = pknox1_exons, introns = to_intron(pknox1_exons, "transcript_name"), group_var = "transcript_name" ) # shorten_gaps() returns exons and introns all in one data.frame() # let's split these for plotting pknox1_rescaled_exons <- pknox1_rescaled %>% dplyr::filter(type == "exon") pknox1_rescaled_introns <- pknox1_rescaled %>% dplyr::filter(type == "intron") pknox1_rescaled_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = pknox1_rescaled_introns, aes(strand = strand), arrow.min.intron.length = 300 ) ``` ### Comparing between two transcripts using `geom_half_range()` If you are interested in the differences between two transcripts, you can use `geom_half_range()` whilst adjusting `range.orientation` to plot the exons from each on the opposite sides of the transcript structure. This can reveal small differences in exon structure, such as those observed here at the 5' ends of *PKNOX1-201* and *PKNOX1-203*. ```{r geom-half-range, fig.height = 3} # extract the two transcripts to be compared pknox1_rescaled_201_exons <- pknox1_rescaled_exons %>% dplyr::filter(transcript_name == "PKNOX1-201") pknox1_rescaled_203_exons <- pknox1_rescaled_exons %>% dplyr::filter(transcript_name == "PKNOX1-203") pknox1_rescaled_201_exons %>% ggplot(aes( xstart = start, xend = end, y = "PKNOX1-201/203" )) + geom_half_range() + geom_intron( data = to_intron(pknox1_rescaled_201_exons, "transcript_name"), arrow.min.intron.length = 300 ) + geom_half_range( data = pknox1_rescaled_203_exons, range.orientation = "top", fill = "purple" ) + geom_intron( data = to_intron(pknox1_rescaled_203_exons, "transcript_name"), arrow.min.intron.length = 300 ) ``` ### Comparing many transcripts to a single reference transcript using `to_diff()` Sometimes, it can be useful to visualize the differences of several transcripts with respect to one transcript. For example, you may be interested in how other transcripts differ in structure to the MANE-select transcript. This exploration can reveal whether certain important regions are missing or novel regions are added, hinting at differences in transcript function. `to_diff()` is a helper function designed for this situation. Here, we apply this to *PKNOX1*, finding the differences between all other transcripts and the MANE-select transcript (*PKNOX1-201*). > 📌: Although below, we apply `to_diff()` to the rescaled exons and intron (outputted by `shorten_gaps()`), `to_diff()` can also be applied to the original, unscaled transcripts with the same effect. ```{r to-diff} mane <- pknox1_rescaled_201_exons not_mane <- pknox1_rescaled_exons %>% dplyr::filter(transcript_name != "PKNOX1-201") pknox1_rescaled_diffs <- to_diff( exons = not_mane, ref_exons = mane, group_var = "transcript_name" ) pknox1_rescaled_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = pknox1_rescaled_introns, arrow.min.intron.length = 300 ) + geom_range( data = pknox1_rescaled_diffs, aes(fill = diff_type), alpha = 0.2 ) ```
## Integrating existing `ggplot2` functionality As a `ggplot2` extension, `ggtranscript` inherits `ggplot2`'s familiarity and flexibility, enabling users to intuitively adjust aesthetics, parameters, scales etc as well as complement `ggtranscript` geoms with existing `ggplot2` geoms to create informative, publication-ready plots. Below is a list outlining some examples of complementing `ggtranscript` with existing `ggplot2` functionality that we have found useful: - Adding exon annotation such as [exon number/order](https://dzhang32.github.io/ggtranscript/reference/add_exon_number.html) using `add_exon_number()` and `geom_text()` ```{r exon-num-ex} base_sod1_plot <- sod1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(sod1_exons, "transcript_name"), aes(strand = strand) ) base_sod1_plot + geom_text( data = add_exon_number(sod1_exons, "transcript_name"), aes( x = (start + end) / 2, # plot label at midpoint of exon label = exon_number ), size = 3.5, nudge_y = 0.4 ) ``` - Zooming in on areas of interest using `coord_cartesian()` or `ggforce::facet_zoom()` ```{r zoom-ex} base_sod1_plot + coord_cartesian(xlim = c(31665500, 31669000)) ``` - Plotting mutations using `geom_vline()` ```{r mutation-ex} example_mutation <- dplyr::tibble( transcript_name = "SOD1-204", position = 31661600 ) # xstart and xend are set here to override the default aes() base_sod1_plot + geom_vline( data = example_mutation, aes( xintercept = position, xstart = NULL, xend = NULL ), linetype = 2, colour = "red" ) ``` - Beautifying plots using themes and scales ```{r beautify-ex} base_sod1_plot + theme_bw() + scale_x_continuous(name = "Position") + scale_y_discrete(name = "Transcript name") + scale_fill_discrete( name = "Transcript biotype", labels = c("Processed transcript", "Protein-coding") ) ```
## Session info
Show/hide ```{r session-info, echo = FALSE} library("sessioninfo") options(width = 120) session_info() ```