Full Code of dzhang32/ggtranscript for AI

master 682a0df688ad cached
62 files
227.7 KB
64.8k tokens
1 requests
Download .txt
Showing preview only (243K chars total). Download the full file or copy to clipboard to get everything.
Repository: dzhang32/ggtranscript
Branch: master
Commit: 682a0df688ad
Files: 62
Total size: 227.7 KB

Directory structure:
gitextract_nqf_jr3r/

├── .Rbuildignore
├── .github/
│   ├── .gitignore
│   └── workflows/
│       └── check-bioc.yml
├── .gitignore
├── .pre-commit-config.yaml
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│   ├── add_exon_number.R
│   ├── add_utr.R
│   ├── data.R
│   ├── geom_half_range.R
│   ├── geom_intron.R
│   ├── geom_junction.R
│   ├── geom_junction_label_repel.R
│   ├── geom_range.R
│   ├── ggtranscript-package.R
│   ├── globals.R
│   ├── shorten_gaps.R
│   ├── to_diff.R
│   ├── to_intron.R
│   └── utils.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── codecov.yml
├── data/
│   ├── pknox1_annotation.rda
│   ├── sod1_annotation.rda
│   └── sod1_junctions.rda
├── data-raw/
│   ├── ggplot2_exts_thumbnail.R
│   ├── ggtranscript_logo.R
│   ├── sod1_junctions.R
│   └── sod1_pknox1_annotation.R
├── inst/
│   └── CITATION
├── man/
│   ├── add_exon_number.Rd
│   ├── add_utr.Rd
│   ├── geom_intron.Rd
│   ├── geom_junction.Rd
│   ├── geom_junction_label_repel.Rd
│   ├── geom_range.Rd
│   ├── ggtranscript.Rd
│   ├── shorten_gaps.Rd
│   ├── sod1_annotation.Rd
│   ├── sod1_junctions.Rd
│   ├── to_diff.Rd
│   └── to_intron.Rd
├── tests/
│   ├── testthat/
│   │   ├── test-add_exon_number.R
│   │   ├── test-add_utr.R
│   │   ├── test-geom_half_range.R
│   │   ├── test-geom_intron.R
│   │   ├── test-geom_junction.R
│   │   ├── test-geom_junction_label_repel.R
│   │   ├── test-geom_range.R
│   │   ├── test-shorten_gaps.R
│   │   ├── test-to_diff.R
│   │   ├── test-to_intron.R
│   │   └── test-utils.R
│   └── testthat.R
└── vignettes/
    ├── .gitignore
    └── ggtranscript.Rmd

================================================
FILE CONTENTS
================================================

================================================
FILE: .Rbuildignore
================================================
^ggtranscript\.Rproj$
^\.Rproj\.user$
^dev$
^README\.Rmd$
^\.github$
^codecov\.yml$
^.pre-commit-config.yaml$
^data-raw$
^_pkgdown.yml$
^tests/testthat/_snaps/*
^LICENSE\.md$
^cran-comments\.md$


================================================
FILE: .github/.gitignore
================================================
*.html


================================================
FILE: .github/workflows/check-bioc.yml
================================================
on:
  push:
  pull_request:

name: R-CMD-check-bioc

## These environment variables control whether to run GHA code later on that is
## specific to testthat, covr, and pkgdown.
env:
  has_testthat: 'true'
  run_covr: 'true'
  run_pkgdown: 'true'
  has_RUnit: 'false'
  cache-version: 'cache-v1'

jobs:
  build-check:
    runs-on: ${{ matrix.config.os }}
    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
    container: ${{ matrix.config.cont }}

    strategy:
      fail-fast: false
      matrix:
        config:
          - { os: ubuntu-latest, r: '4.4', bioc: '3.19', cont: "bioconductor/bioconductor_docker:RELEASE_3_19", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
          
    # Job-level environment shared by every step.
    # Environment variables are always strings, so boolean-looking values are
    # quoted to make that explicit rather than relying on implicit coercion.
    env:
      R_REMOTES_NO_ERRORS_FROM_WARNINGS: 'true'
      RSPM: ${{ matrix.config.rspm }}
      NOT_CRAN: 'true'
      TZ: UTC
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

    steps:

      - name: Set R Library home on Linux
        if: runner.os == 'Linux'
        run: |
          mkdir /__w/_temp/Library
          echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile

      - name: Checkout Repository
        uses: actions/checkout@v3

      - name: Setup R from r-lib
        if: runner.os != 'Linux'
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}

      - name: Setup pandoc from r-lib
        if: runner.os != 'Linux'
        uses: r-lib/actions/setup-pandoc@v2

      - name: Query dependencies
        run: |
          install.packages('remotes')
          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
        shell: Rscript {0}

      # Non-Linux runners only: restore the R library cache.
      # The key is derived from the matrix (Bioconductor + R version) so the
      # cache is invalidated whenever those versions change, instead of being
      # pinned to a hard-coded, stale version label.
      # Include '/nocache' in the commit message to skip the cache entirely.
      - name: Restore R package cache
        if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'"
        uses: actions/cache@v3
        with:
          path: ${{ env.R_LIBS_USER }}
          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ matrix.config.bioc }}-r-${{ matrix.config.r }}-${{ hashFiles('.github/depends.Rds') }}
          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ matrix.config.bioc }}-r-${{ matrix.config.r }}-

      # Linux (container) runners: cache the R library directory.
      # The key is derived from the matrix (Bioconductor + R version) so that
      # packages built for one R/Bioconductor release are never restored into
      # a job running a different release.
      # Include '/nocache' in the commit message to skip the cache entirely.
      - name: Cache R packages on Linux
        if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
        uses: actions/cache@v3
        with:
          path: /home/runner/work/_temp/Library
          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ matrix.config.bioc }}-r-${{ matrix.config.r }}-${{ hashFiles('.github/depends.Rds') }}
          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ matrix.config.bioc }}-r-${{ matrix.config.r }}-

      - name: Install Linux system dependencies
        if: runner.os == 'Linux'
        run: |
          sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))')
          echo $sysreqs
          sudo -s eval "$sysreqs"

      - name: Install BiocManager
        run: |
          message(paste('****', Sys.time(), 'installing BiocManager ****'))
          remotes::install_cran("BiocManager")
        shell: Rscript {0}

      - name: Set BiocVersion
        run: |
          BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE)
        shell: Rscript {0}

      # First (best-effort) dependency installation pass. Failures here are
      # tolerated (continue-on-error); pass 2 installs anything still missing.
      - name: Install dependencies pass 1
        run: |
          ## Try installing the package dependencies in steps. First the local
          ## dependencies, then any remaining dependencies to avoid the
          ## issues described at
          ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html
          ## https://github.com/r-lib/remotes/issues/296
          ## Ideally, all dependencies should get installed in the first pass.

          ## Set the repos source depending on the OS
          ## Alternatively use https://storage.googleapis.com/bioconductor_docker/packages/
          ## though based on https://bit.ly/bioc2021-package-binaries
          ## the Azure link will be the main one going forward.
          ## The binary repository path follows the Bioconductor version of the
          ## build matrix so it stays in sync with the container in use.
          ## NOTE(review): confirm that this binary repository is published
          ## for every Bioconductor version listed in the matrix.
          gha_repos <- if(
              .Platform$OS.type == "unix" && Sys.info()["sysname"] != "Darwin"
          ) c(
              "AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/${{ matrix.config.bioc }}/bioc",
              BiocManager::repositories()
              ) else BiocManager::repositories()

          ## For running the checks
          message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****'))
          install.packages(c("rcmdcheck", "BiocCheck"), repos = gha_repos)

          ## Pass #1 at installing dependencies
          ## This pass uses AnVIL-powered fast binaries
          ## details at https://github.com/nturaga/bioc2021-bioconductor-binaries
          ## The speed gains only apply to the docker builds.
          message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
          remotes::install_local(dependencies = TRUE, repos = gha_repos, build_vignettes = FALSE, upgrade = TRUE)
        continue-on-error: true
        shell: Rscript {0}

      - name: Install dependencies pass 2
        run: |
          ## Pass #2 at installing dependencies
          ## This pass does not use AnVIL and will thus update any packages
          ## that have since been updated in Bioconductor
          message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE)
        shell: Rscript {0}

      - name: Install BiocGenerics
        if:  env.has_RUnit == 'true'
        run: |
          ## Install BiocGenerics
          BiocManager::install("BiocGenerics")
        shell: Rscript {0}

      - name: Install covr
        if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux'
        run: |
          remotes::install_cran("covr")
        shell: Rscript {0}

      - name: Install pkgdown
        if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
        run: |
          remotes::install_cran("pkgdown")
        shell: Rscript {0}

      - name: Session info
        run: |
          options(width = 100)
          pkgs <- installed.packages()[, "Package"]
          sessioninfo::session_info(pkgs, include_base = TRUE)
        shell: Rscript {0}

      - name: Run CMD check
        env:
          _R_CHECK_CRAN_INCOMING_: false
          DISPLAY: 99.0
        run: |
          options(crayon.enabled = TRUE)
          rcmdcheck::rcmdcheck(
              args = c("--no-manual", "--no-vignettes", "--timings"),
              build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
              error_on = "warning",
              check_dir = "check"
          )
        shell: Rscript {0}

      ## Might need to add this to the if:  && runner.os == 'Linux'
      - name: Reveal testthat details
        if:  env.has_testthat == 'true'
        run: find . -name testthat.Rout -exec cat '{}' ';'

      - name: Run RUnit tests
        if:  env.has_RUnit == 'true'
        run: |
          BiocGenerics:::testPackage()
        shell: Rscript {0}

      - name: Run BiocCheck
        env:
          DISPLAY: 99.0
        run: |
          BiocCheck::BiocCheck(
              dir('check', 'tar.gz$', full.names = TRUE),
              `quit-with-status` = FALSE,
              `no-check-R-ver` = TRUE,
              `no-check-bioc-help` = TRUE
          )
        shell: Rscript {0}

      - name: Test coverage
        if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux'
        run: |
          covr::codecov()
        shell: Rscript {0}

      - name: Install package
        if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
        run: R CMD INSTALL .
        
      - name: Get R package info
        if: runner.os == 'Linux'
        run: |
          #### DockerHub repos must be lowercase (,,) ####
          name=$(grep '^Package:' DESCRIPTION | cut -d\   -f2)
          echo "packageNameOrig=${name}" >> $GITHUB_ENV
          echo $name
          version=$(grep Version DESCRIPTION | grep -o "[0-9.]\+")
          echo "packageVersion=${version}" >> $GITHUB_ENV
          echo $version
        shell: bash {0}

      - name: Build and deploy pkgdown site
        if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
        run: |
          git config --global --add safe.directory /__w/${{env.packageNameOrig}}/${{env.packageNameOrig}} 
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)"
        shell: bash {0}
        ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE)
        ## at least once locally before this will work. This creates the gh-pages
        ## branch (erasing anything you haven't version controlled!) and
        ## makes the git history recognizable by pkgdown.

      # On failure, publish the R CMD check directory for debugging.
      # The action is pinned to a released major version rather than a
      # floating branch, and the artifact name reflects the versions that
      # were actually tested.
      - name: Upload check results
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ runner.os }}-biocversion-${{ matrix.config.bioc }}-r-${{ matrix.config.r }}-results
          path: check


================================================
FILE: .gitignore
================================================
.Rproj.user
inst/doc
*.DS_Store
*.pdf
*.Rproj


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
-   repo: https://github.com/lorenzwalthert/precommit
    rev: v0.1.3.9133
    hooks:
    -   id: readme-rmd-rendered # make sure README.Rmd is rendered to README.md
    -   id: parsable-R
        exclude: >
          (?x)^(
          tests/testthat/in/style-files-fail-parse\.R|
          tests/testthat/in/parsable-R-fail\.R|
          )$
    -   id: style-files # style code in the tidyverse style
        args: [--indent_by=4]
        exclude: >
          (?x)^(
          tests/testthat/in/.*\.R|
          renv/.*
          )$
    -   id: deps-in-desc # all pkg::func dependencies are listed in DESCRIPTION
        args: [--allow_private_imports]
        exclude: >
          (?x)^(
          tests/testthat/in/.*|
          inst/renv-update\.R|
          renv/activate.R|
          vignettes/FAQ\.Rmd|
          )$
    -   id: lintr
        args: [--warn_only]
        verbose: true

-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
    -   id: check-added-large-files # make sure no large files committed
    -   id: end-of-file-fixer
        exclude: '\.Rd'


================================================
FILE: DESCRIPTION
================================================
Package: ggtranscript
Title: Visualizing Transcript Structure and Annotation using 'ggplot2'
Version: 1.0.0
Authors@R:
    c(
    person("David", "Zhang", , "dyzhang32@gmail.com", 
           role = c("aut", "cre"),
           comment = c(ORCID = "0000-0003-2382-8460")), 
    person("Emil", "Gustavsson", , "e.gustavsson@ucl.ac.uk", role = c("aut"),
           comment = c(ORCID = "0000-0003-0541-7537")),
    person("Regina", "Reynolds", , "regina.reynolds.16@ucl.ac.uk", 
           role = c("ctb"), comment = c(ORCID = "0000-0001-6470-7919")), 
    person("Sonia", "Ruiz", , "s.ruiz@ucl.ac.uk", 
           role = c("ctb"))
    )
Description: The goal of ggtranscript is to simplify the process of visualizing 
    transcript structure and annotation. To achieve this, ggtranscript 
    introduces 5 new geoms (geom_range(), geom_half_range(), geom_intron(), 
    geom_junction() and geom_junction_label_repel()) as well as several helper 
    functions. As a 'ggplot2' extension, ggtranscript inherits 'ggplot2's 
    familiarity and flexibility, enabling users to intuitively adjust 
    aesthetics, parameters, scales etc as well as complement ggtranscript geoms
    with existing 'ggplot2' geoms to create informative, publication-ready 
    plots.
License: MIT + file LICENSE
URL: https://github.com/dzhang32/ggtranscript
BugReports: https://github.com/dzhang32/ggtranscript/issues
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Suggests: 
    BiocStyle,
    covr,
    ggpubr,
    knitr,
    rmarkdown,
    rtracklayer,
    sessioninfo,
    testthat (>= 3.0.0),
    vdiffr
Config/testthat/edition: 3
VignetteBuilder: knitr
Depends: 
    R (>= 2.10)
LazyData: true
Imports: 
    dplyr,
    GenomicRanges,
    ggplot2,
    magrittr,
    rlang,
    S4Vectors,
    GenomeInfoDb,
    ggrepel
Collate: 
    'add_exon_number.R'
    'add_utr.R'
    'data.R'
    'geom_range.R'
    'geom_half_range.R'
    'geom_intron.R'
    'geom_junction.R'
    'geom_junction_label_repel.R'
    'ggtranscript-package.R'
    'globals.R'
    'shorten_gaps.R'
    'to_diff.R'
    'to_intron.R'
    'utils.R'


================================================
FILE: LICENSE
================================================
YEAR: 2022
COPYRIGHT HOLDER: ggtranscript authors


================================================
FILE: LICENSE.md
================================================
# MIT License

Copyright (c) 2022 ggtranscript authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: NAMESPACE
================================================
# Generated by roxygen2: do not edit by hand

export(add_exon_number)
export(add_utr)
export(geom_half_range)
export(geom_intron)
export(geom_junction)
export(geom_junction_label_repel)
export(geom_range)
export(shorten_gaps)
export(to_diff)
export(to_intron)
import(ggrepel)
importFrom(ggplot2,aes)
importFrom(magrittr,"%>%")
importFrom(rlang,"%||%")


================================================
FILE: NEWS.md
================================================

# ggtranscript 1.0.0

## Fixes

* Updates `geom`s to work with the latest version of `ggplot2` and `ggrepel` (R version `4.4`). 
* Fixes all unit tests.
* Fixes CI workflow issues and simplifies testing to linux-only.

# ggtranscript 0.99.9

## NEW FEATURES

* Address ggtranscript reviews; update docs with examples of using gtf/bed files, integration with `ggplot2` extensions and add usage of `shorten_gaps()` to README.

# ggtranscript 0.99.8

## NEW FEATURES

* Address CRAN feedback; changing ggplot2 -> 'ggplot2', remove biocViews and contributing, removing Date field in DESCRIPTION.

# ggtranscript 0.99.7

## NEW FEATURES

* Add `cran-comments.md` in preparation for first CRAN submission.
* Update CI to run `R CMD Check` on latest R version (4.2).

# ggtranscript 0.99.6

## NEW FEATURES

* Add `@return` documentation for `geom_*` functions for `BiocCheck`.

# ggtranscript 0.99.5

## NEW FEATURES

* Change branch naming from main to master to match BBS.

# ggtranscript 0.99.4

## NEW FEATURES

* Change email to UCL email for Bioconductor submission. 

# ggtranscript 0.99.3

## NEW FEATURES

* Add `add_utr()` for adding UTRs as ranges. This helper function is designed to 
work with `shorten_gaps()`, enabling shortening of gaps whilst visually 
differentiating UTRs from the CDS.
* Allow `to_intron()` to take CDS and UTRs ranges as input. 
* Submit to Bioconductor.

# ggtranscript 0.99.2

## NEW FEATURES

* Add `geom_junction_label_repel()` for labeling junctions (e.g. with counts).
* Add `add_exon_number()` for visualizing the exon number/order.

# ggtranscript 0.99.1

## NEW FEATURES

* Implement base geoms: `geom_range()`, `geom_half_range()`, `geom_intron()`, 
`geom_junction()` and helper functions: `to_intron()`, `to_diff()` and 
`shorten_gaps()`.


================================================
FILE: R/add_exon_number.R
================================================
#' Add exon number
#'
#' `add_exon_number()` adds the exon number (the order the exons are transcribed
#' within each transcript) as a column in `exons`. This can be useful when
#' visualizing long, complex transcript structures, in order to keep track of
#' specific exons of interest.
#'
#' To note, a "strand" column must be present within `exons`. The strand is used
#' to differentiate whether exon numbers should be calculated according to
#' ascending ("+") or descending ("-") genomic co-ordinates. For ambiguous
#' strands ("*"), `add_exon_number()` will assume the strand to be "+".
#'
#' @inheritParams to_diff
#'
#' @return `data.frame()` equivalent to input `exons`, with the additional
#'   column "exon_number".
#'
#' @export
#' @examples
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' # add the exon number for each transcript
#' sod1_exons <- sod1_exons %>% add_exon_number(group_var = "transcript_name")
#'
#' base <- sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name"),
#'         strand = "+"
#'     )
#'
#' # it can be useful to annotate exons with their exon number
#' # using ggplot2::geom_text()
#' base +
#'     geom_text(aes(
#'         x = (start + end) / 2, # plot label at midpoint of exon
#'         label = exon_number
#'     ),
#'     size = 3.5,
#'     nudge_y = 0.4
#'     )
#'
#' # Or alternatively, using ggrepel::geom_label_repel()
#' # to separate labels from exons
#' base +
#'     ggrepel::geom_label_repel(ggplot2::aes(
#'         x = (start + end) / 2,
#'         label = exon_number
#'     ),
#'     size = 3.5,
#'     min.segment.length = 0
#'     )
add_exon_number <- function(exons, group_var = NULL) {
    # validate inputs; the "strand" column is needed below to decide the
    # direction in which exons are numbered
    .check_coord_object(exons, check_strand = TRUE)
    .check_group_var(exons, group_var)

    # number exons within each group (e.g. transcript) when one is supplied
    if (!is.null(group_var)) {
        exons <- exons %>% dplyr::group_by_at(.vars = group_var)
    }

    # arrange to make sure order reflects genomic position
    # (`.vars` is passed as a named argument; c(NULL, ...) drops a NULL
    # group_var, so ungrouped input is simply sorted by start and end)
    exons <- exons %>%
        dplyr::arrange_at(.vars = c(group_var, "start", "end"))

    # add exon number, assuming all plus strand at start
    # n_exons is a temporary helper column, removed again below
    exons <- exons %>%
        dplyr::mutate(
            exon_number = dplyr::row_number(),
            n_exons = dplyr::n()
        ) %>%
        dplyr::ungroup()

    # convert exon number for minus strand: these exons are numbered in
    # descending genomic order, so the ascending index is mirrored
    exons <- exons %>%
        dplyr::mutate(
            exon_number = ifelse(
                strand == "-",
                n_exons - exon_number + 1,
                exon_number
            )
        ) %>%
        dplyr::select(-n_exons)

    return(exons)
}


================================================
FILE: R/add_utr.R
================================================
#' Add untranslated regions (UTRs)
#'
#' Given a set of `exons` (encompassing the CDS and UTRs) and `cds` regions,
#' `add_utr()` will calculate and add the corresponding UTR regions as ranges.
#' This can be useful when combined with `shorten_gaps()` to visualize
#' transcripts with long introns, whilst differentiating UTRs from CDS regions.
#'
#' The definition of the inputted `cds` regions are expected to range from the
#' beginning of the start codon to the end of the stop codon. Sometimes, for
#' example in the case of Ensembl, reference annotation will omit the stop
#' codons from the CDS definition. In such cases, users should manually ensure
#' that the `cds` includes both the start and stop codons.
#'
#' @inheritParams to_diff
#' @param cds `data.frame()` contains coding sequence ranges for the transcripts
#'   in `exons`.
#'
#' @return `data.frame()` contains differentiated CDS and UTR ranges.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' pknox1_annotation %>% head()
#'
#' # extract exons
#' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
#' pknox1_exons %>% head()
#'
#' # extract cds
#' pknox1_cds <- pknox1_annotation %>% dplyr::filter(type == "CDS")
#' pknox1_cds %>% head()
#'
#' # the CDS definition originating from the Ensembl reference annotation
#' # does not include the stop codon
#' # we must incorporate the stop codons into the CDS manually
#' # by adding 3 base pairs to the end of the CDS of each transcript
#' pknox1_cds_w_stop <- pknox1_cds %>%
#'     dplyr::group_by(transcript_name) %>%
#'     dplyr::mutate(
#'         end = ifelse(end == max(end), end + 3, end)
#'     ) %>%
#'     dplyr::ungroup()
#'
#' # add_utr() adds ranges that represent the UTRs
#' pknox1_cds_utr <- add_utr(
#'     pknox1_exons,
#'     pknox1_cds_w_stop,
#'     group_var = "transcript_name"
#' )
#'
#' pknox1_cds_utr %>% head()
#'
#' # this can be useful when combined with shorten_gaps()
#' # to visualize transcripts with long introns whilst differentiating UTRs
#' pknox1_cds_utr_rescaled <-
#'     shorten_gaps(
#'         exons = pknox1_cds_utr,
#'         introns = to_intron(pknox1_cds_utr, "transcript_name"),
#'         group_var = "transcript_name"
#'     )
#'
#' pknox1_cds_utr_rescaled %>%
#'     dplyr::filter(type == "CDS") %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_range(
#'         data = pknox1_cds_utr_rescaled %>% dplyr::filter(type == "UTR"),
#'         height = 0.25,
#'         fill = "white"
#'     ) +
#'     geom_intron(
#'         data = to_intron(
#'             pknox1_cds_utr_rescaled %>% dplyr::filter(type != "intron"),
#'             "transcript_name"
#'         ),
#'         arrow.min.intron.length = 110
#'     )
add_utr <- function(exons,
                    cds,
                    group_var = NULL) {

    # validate both inputs (seqnames are checked for exons and cds alike)
    .check_coord_object(exons, check_seqnames = TRUE)
    .check_group_var(exons, group_var)
    .check_coord_object(cds, check_seqnames = TRUE)
    .check_group_var(cds, group_var)

    # without a grouping variable, place every range in one placeholder group
    # so the per-group processing below can be applied uniformly
    no_group <- is.null(group_var)
    if (no_group) {
        exons <- dplyr::mutate(exons, dummy_group = "A")
        cds <- dplyr::mutate(cds, dummy_group = "A")
        group_var <- "dummy_group"
    }

    # only groups present in cds are processed
    group_ids <- unique(cds[[group_var]])

    # convert to GenomicRanges for the set operations below
    exons_gr <- GenomicRanges::GRanges(exons)
    cds_gr <- GenomicRanges::GRanges(cds)

    # group membership of every range, looked up once
    exon_groups <- GenomicRanges::mcols(exons_gr)[[group_var]]
    cds_groups <- GenomicRanges::mcols(cds_gr)[[group_var]]

    ranges_by_group <- lapply(seq_along(group_ids), function(i) {
        group_id <- group_ids[i]

        group_exons <- exons_gr[exon_groups == group_id]
        group_cds <- cds_gr[cds_groups == group_id]

        # regions covered by exons but not by the cds are the UTRs
        group_utrs <- GenomicRanges::setdiff(group_exons, group_cds)
        GenomicRanges::mcols(group_utrs)[[group_var]] <- group_id

        group_utrs$type <- "UTR"
        group_cds$type <- "CDS"

        sort(c(group_utrs, group_cds))
    })

    # combine the per-group ranges and return them as a tibble
    combined_gr <- do.call(c, ranges_by_group)
    exons_w_utr <- dplyr::as_tibble(as.data.frame(combined_gr))

    # remove dummy_group if it was created above
    if (no_group) {
        exons_w_utr <- dplyr::select(exons_w_utr, -dummy_group)
    }

    return(exons_w_utr)
}


================================================
FILE: R/data.R
================================================
#' Example transcript annotation
#'
#' Transcript annotation including the co-ordinates (hg38) of the genes,
#' transcripts, exons and CDS regions for \emph{SOD1} and \emph{PKNOX1}, which
#' originate from version 105 of the Ensembl reference annotation.
#'
#' @format A `tibble::tibble()`:
#' \describe{
#'   \item{seqnames}{`factor()` chromosome.}
#'   \item{start}{`integer()` start position.}
#'   \item{end}{`integer()` end position.}
#'   \item{strand}{`factor()` strand.}
#'   \item{type}{`factor()` e.g. gene, transcript, exon or CDS.}
#'   \item{gene_name}{`character()` name of gene (e.g. SOD1 or PKNOX1).}
#'   \item{transcript_name}{`character()` name of transcript.}
#'   \item{transcript_biotype}{`character()` biotype of transcript.}
#' }
#'
#' @source generated using `ggtranscript/data-raw/sod1_pknox1_annotation.R`
"sod1_annotation"

#' @rdname sod1_annotation
"pknox1_annotation"

#' Example junctions
#'
#' Junction co-ordinates and counts associated with the \emph{SOD1} gene.
#' Junctions counts originate from GTEx liver samples and are downloaded via the
#' Bioconductor package `snapcount`. Only unannotated junctions with a mean
#' count above 0.3 have been retained for this example.
#'
#' @format A `tibble::tibble()`:
#' \describe{
#'   \item{seqnames}{`factor()` chromosome.}
#'   \item{start}{`integer()` start position.}
#'   \item{end}{`integer()` end position.}
#'   \item{strand}{`factor()` strand.}
#'   \item{mean_count}{`factor()` Average count across all GTEx liver samples.}
#' }
#'
#' @source generated using `ggtranscript/data-raw/sod1_junctions.R`
"sod1_junctions"


================================================
FILE: R/geom_half_range.R
================================================
#' @param range.orientation `character()` one of "top" or "bottom", specifying
#'   where the half ranges will be plotted with respect to each transcript
#'   (`y`).
#'
#' @export
#' @rdname geom_range
geom_half_range <- function(mapping = NULL, data = NULL,
                            stat = "identity", position = "identity",
                            ...,
                            range.orientation = "bottom",
                            linejoin = "mitre",
                            na.rm = FALSE,
                            show.legend = NA,
                            inherit.aes = TRUE) {
    # parameters forwarded to the geom; anything passed via `...` is
    # appended after the explicitly named ones
    layer_params <- list(
        range.orientation = range.orientation,
        linejoin = linejoin,
        na.rm = na.rm,
        ...
    )

    # build the layer around GeomHalfRange
    ggplot2::layer(
        geom = GeomHalfRange,
        stat = stat,
        data = data,
        mapping = mapping,
        position = position,
        params = layer_params,
        inherit.aes = inherit.aes,
        show.legend = show.legend
    )
}

#' `GeomHalfRange` is `GeomRange` with default parameters for `vjust` and
#' `height` as well as the added parameter `range.orientation`
#'
#' @include geom_range.R
#' @keywords internal
#' @noRd
GeomHalfRange <- ggplot2::ggproto("GeomHalfRange", GeomRange,
    setup_data = function(data, params) {
        # stop early if range.orientation is not a supported option
        .check_range.orientation(params)

        # modified from ggplot2::GeomTile
        # height precedence: data column, then layer parameter, then 0.25
        data$height <- data$height %||% params$height %||% 0.25

        # derive the rectangle bounds from xstart/xend and a band of
        # `height` centred on y, then drop the helper height column
        transform(
            data,
            xmin = xstart,
            xmax = xend,
            ymin = y - height / 2,
            ymax = y + height / 2,
            height = NULL
        )
    },
    draw_panel = function(data,
                          panel_params,
                          coord,
                          range.orientation = "bottom",
                          lineend = "butt",
                          linejoin = "mitre") {
        # "bottom" uses a vjust of 1.5, any other orientation uses 0.5
        vjust <- if (range.orientation == "bottom") 1.5 else 0.5

        # delegate the actual drawing to the parent geom
        GeomRange$draw_panel(
            data = data,
            panel_params = panel_params,
            coord = coord,
            lineend = lineend,
            linejoin = linejoin,
            vjust = vjust
        )
    }
)

#' @keywords internal
#' @noRd
.check_range.orientation <- function(params) {
    # Validate the `range.orientation` layer parameter.
    #
    # params: list of layer parameters; only `params$range.orientation` is
    #   read.
    # Returns NULL invisibly when the value is valid, otherwise raises an
    #   error naming the accepted options.
    orientation <- params$range.orientation
    valid_options <- c("top", "bottom")

    # require exactly one value so that NULL or a vector of orientations is
    # reported with the informative message below rather than an unrelated
    # error from `if`
    is_valid <- length(orientation) == 1 &&
        orientation %in% valid_options

    if (!is_valid) {
        stop(
            "range.orientation must be one of ",
            "'top' or 'bottom'"
        )
    }

    invisible(NULL)
}


================================================
FILE: R/geom_intron.R
================================================
#' Plot intron lines with strand arrows
#'
#' `geom_intron()` draws horizontal lines with central arrows that are designed
#' to represent introns. In combination with `geom_range()`/`geom_half_range()`,
#' these geoms form the core components for visualizing transcript structures.
#'
#' `geom_intron()` requires the following `aes()`; `xstart`, `xend` and `y`
#' (e.g. transcript name). If users do not have intron co-ordinates, these can
#' be generated from the corresponding exons using `to_intron()`. The `strand`
#' option (one of "+" or "-") adjusts the arrow direction to match the direction
#' of transcription. The `arrow.min.intron.length` parameter can be useful to
#' remove strand arrows that overlap exons, which can be a problem if plotted
#' introns include those that are relatively short.
#'
#' @inheritParams ggplot2::layer
#' @inheritParams ggplot2::geom_point
#' @inheritParams ggplot2::geom_segment
#' @param arrow.min.intron.length `integer()` the minimum required width of an
#'   intron for a strand arrow to be drawn. This can be useful to remove strand
#'   arrows on short introns that overlap adjacent exons.
#'
#' @return the return value of a `geom_*` function is not intended to be
#'   directly handled by users. Therefore, `geom_*` functions should never be
#'   executed in isolation, rather used in combination with a
#'   `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' pknox1_annotation %>% head()
#'
#' # extract exons
#' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
#' pknox1_exons %>% head()
#'
#' # to_intron() is a helper function included in ggtranscript
#' # which is useful for converting exon co-ordinates to introns
#' pknox1_introns <- pknox1_exons %>% to_intron(group_var = "transcript_name")
#' pknox1_introns %>% head()
#'
#' base <- pknox1_introns %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     ))
#'
#' # by default, geom_intron() assumes introns originate from the "+" strand
#' base + geom_intron()
#'
#' # however this can be modified using the strand option
#' base + geom_intron(strand = "-")
#'
#' # strand can also be set as an aes()
#' base + geom_intron(aes(strand = strand))
#'
#' # as a ggplot2 extension, ggtranscript geoms inherit
#' # the functionality from the parameters and aesthetics in ggplot2
#' base + geom_intron(
#'     aes(colour = transcript_name),
#'     linewidth = 1
#' )
#'
#' # together, geom_range() and geom_intron() are designed to visualize
#' # the core components of transcript annotation
#' pknox1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = pknox1_introns
#'     )
#'
#' # for short introns, sometimes strand arrows will overlap exons
#' # to avoid this, users can set the arrow.min.intron.length parameter
#' pknox1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = pknox1_introns,
#'         arrow.min.intron.length = 3500
#'     )
geom_intron <- function(mapping = NULL, data = NULL,
                        stat = "identity", position = "identity",
                        ...,
                        arrow = grid::arrow(ends = "last", length = grid::unit(0.1, "inches")),
                        arrow.fill = NULL,
                        lineend = "butt",
                        linejoin = "round",
                        na.rm = FALSE,
                        arrow.min.intron.length = 0,
                        show.legend = NA,
                        inherit.aes = TRUE) {
    # parameters forwarded to the layer: the named geom_intron() arguments
    # first, followed by anything supplied through `...`
    intron_params <- list(
        arrow = arrow,
        arrow.fill = arrow.fill,
        lineend = lineend,
        linejoin = linejoin,
        na.rm = na.rm,
        arrow.min.intron.length = arrow.min.intron.length,
        ...
    )

    # build the layer around GeomIntron
    ggplot2::layer(
        geom = GeomIntron,
        stat = stat,
        position = position,
        data = data,
        mapping = mapping,
        show.legend = show.legend,
        inherit.aes = inherit.aes,
        params = intron_params
    )
}

#' `GeomIntron` is pretty much `ggplot2::GeomSegment` with the `required_aes`
#' changed to `xstart`/`xend` to match genetic nomenclature and the added arrows
#' to indicate direction of transcription (configured with `strand` and
#' `arrow.min.intron.length`)
#'
#' * `setup_params()` validates `arrow.min.intron.length`.
#' * `setup_data()` renames `xstart` to `x` and sets `yend` to `y`, giving the
#'   columns used by `ggplot2::GeomSegment`.
#' * `draw_panel()` returns a `grid::grobTree()` holding the intron lines plus
#'   one arrow grob per strand.
#' @noRd
GeomIntron <- ggplot2::ggproto("GeomIntron", ggplot2::GeomSegment,
    required_aes = c("xstart", "xend", "y"),
    default_aes = aes(
        colour = "black",
        linewidth = 0.5,
        linetype = 1,
        alpha = NA,
        strand = "+"
    ),
    setup_params = function(data, params) {
        # arrow.min.intron.length must be a single, non-missing numeric >= 0
        # `&&` short-circuits, so NULL, NA, vector or non-numeric input all
        # reach the stop() below instead of an invalid `if` condition
        min_length <- params$arrow.min.intron.length
        min_length_valid <- is.numeric(min_length) &&
            length(min_length) == 1 &&
            !is.na(min_length) &&
            min_length >= 0

        if (!min_length_valid) {
            # 0 is allowed (it is the default), hence ">=" rather than ">"
            stop("arrow.min.intron.length must be a numeric >= 0")
        }

        params
    },
    setup_data = function(data, params) {
        # needed to permit usage of xstart/xend
        # segments are horizontal, so yend is set to y
        transform(
            data,
            x = xstart,
            yend = y,
            xstart = NULL
        )
    },
    draw_panel = function(data,
                          panel_params,
                          coord,
                          arrow = NULL,
                          arrow.fill = NULL,
                          lineend = "butt",
                          linejoin = "round",
                          na.rm = FALSE,
                          arrow.min.intron.length = 0) {

        # check that every strand value is non-missing and one of "+" or "-"
        .check_strand(data$strand)

        # first, create the intron grob, which is just a pure line (no arrow)
        intron_grob <- ggplot2::GeomSegment$draw_panel(
            data = data,
            panel_params = panel_params,
            coord = coord,
            arrow = NULL,
            arrow.fill = NULL,
            lineend = lineend,
            linejoin = linejoin,
            na.rm = na.rm
        )

        # then, create the arrow grobs, one per strand
        # both are needed because the arrow direction depends on the
        # orientation of x/xend, which differs between strands
        strand_arrow_plus_grob <- .create_strand_arrow_grob(
            target_strand = "+",
            arrow.min.intron.length = arrow.min.intron.length,
            data = data,
            panel_params = panel_params,
            coord = coord,
            arrow = arrow,
            arrow.fill = arrow.fill,
            lineend = lineend,
            linejoin = linejoin,
            na.rm = na.rm
        )

        strand_arrow_minus_grob <- .create_strand_arrow_grob(
            target_strand = "-",
            arrow.min.intron.length = arrow.min.intron.length,
            data = data,
            panel_params = panel_params,
            coord = coord,
            arrow = arrow,
            arrow.fill = arrow.fill,
            lineend = lineend,
            linejoin = linejoin,
            na.rm = na.rm
        )

        # draw_panel expects return of a grob
        # here, as we build multiple grobs (i.e. intron lines + arrows)
        # we use a grobTree to combine them
        grid::grobTree(
            intron_grob,
            strand_arrow_plus_grob,
            strand_arrow_minus_grob
        )
    }
)

#' Validate strand values
#'
#' @param strand vector of strand values, as found in the `strand` column of
#'   the layer data.
#'
#' @return `NULL`, invisibly. An error is raised if any value is `NA` or is
#'   not one of "+" or "-".
#'
#' @keywords internal
#' @noRd
.check_strand <- function(strand) {
    # TODO - add option for "*" arrow?
    has_missing <- any(is.na(strand))
    all_plus_or_minus <- all(strand %in% c("+", "-"))

    if (has_missing || !all_plus_or_minus) {
        stop("strand values must be one of '+' and '-'")
    }

    invisible()
}

#' Build the strand arrow grob for the introns of a single strand
#'
#' @param target_strand one of "+" or "-"; only rows of `data` with this
#'   `strand` are considered.
#' @param arrow.min.intron.length introns must be longer than this
#'   (`abs(x - xend)`) to receive an arrow.
#' @param data layer data holding `x`, `xend` and `strand` columns.
#' @param panel_params,coord,arrow,arrow.fill,lineend,linejoin,na.rm passed
#'   on unchanged to `ggplot2::GeomSegment$draw_panel()`.
#'
#' @return the grob returned by `ggplot2::GeomSegment$draw_panel()`, or a
#'   `grid::nullGrob()` when no intron qualifies.
#'
#' @keywords internal
#' @noRd
.create_strand_arrow_grob <- function(target_strand,
                                      arrow.min.intron.length,
                                      data,
                                      panel_params,
                                      coord,
                                      arrow,
                                      arrow.fill,
                                      lineend,
                                      linejoin,
                                      na.rm) {

    # keep introns on the requested strand
    # that are longer than arrow.min.intron.length
    on_target_strand <- data$strand == target_strand
    long_enough <- abs(data$x - data$xend) > arrow.min.intron.length
    arrow_data <- data[on_target_strand & long_enough, ]

    # nothing to draw: a nullGrob() adds nothing to the plot
    if (nrow(arrow_data) == 0) {
        return(grid::nullGrob())
    }

    # an arrow can only sit at the end of a segment, so each arrow segment
    # is made to end at the intron midpoint
    # "+" runs from x to the midpoint; "-" runs from xend to the midpoint,
    # which reverses the arrow direction
    if (target_strand == "+") {
        arrow_data <- transform(
            arrow_data,
            xend = (x + xend) / 2
        )
    } else {
        # mid is computed from the original x/xend before x is overwritten
        arrow_data <- transform(
            arrow_data,
            mid = (x + xend) / 2,
            x = xend
        )
        arrow_data <- transform(
            arrow_data,
            xend = mid
        )
    }

    ggplot2::GeomSegment$draw_panel(
        data = arrow_data,
        panel_params = panel_params,
        coord = coord,
        arrow = arrow,
        arrow.fill = arrow.fill,
        lineend = lineend,
        linejoin = linejoin,
        na.rm = na.rm
    )
}


================================================
FILE: R/geom_junction.R
================================================
#' Plot junction curves
#'
#' `geom_junction()` draws curves that are designed to represent junction reads
#' from RNA-sequencing data. It can be useful to overlay junction data on
#' transcript annotation (plotted using `geom_range()`/`geom_half_range()` and
#' `geom_intron()`) to understand which splicing events or transcripts have
#' support from RNA-sequencing data.
#'
#' `geom_junction()` requires the following `aes()`; `xstart`, `xend` and `y`
#' (e.g. transcript name). `geom_junction()` curves can be modified using
#' `junction.y.max`, which can be useful when junctions overlap one
#' another/other transcripts or extend beyond the plot margins. By default,
#' junction curves will alternate between being plotted on the top and bottom of
#' each transcript (`y`), however this can be modified via
#' `junction.orientation`.
#'
#' @inheritParams ggplot2::layer
#' @inheritParams ggplot2::geom_bar
#' @inheritParams grid::curveGrob
#' @param junction.orientation `character()` one of "alternating", "top" or
#'   "bottom", specifying where the junctions will be plotted with respect to
#'   each transcript (`y`).
#' @param junction.y.max `double()` the max y-value of each junction curve. It
#'   can be useful to adjust this parameter when junction curves overlap with
#'   one another/other transcripts or extend beyond the plot margins.
#'
#' @return the return value of a `geom_*` function is not intended to be
#'   directly handled by users. Therefore, `geom_*` functions should never be
#'   executed in isolation, rather used in combination with a
#'   `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # as well as a set of example (unannotated) junctions
#' # originating from GTEx and downloaded via the Bioconductor package snapcount
#' sod1_junctions
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(
#'     type == "exon",
#'     transcript_name == "SOD1-201"
#' )
#' sod1_exons %>% head()
#'
#' # add transcript_name to junctions for plotting
#' sod1_junctions <- sod1_junctions %>%
#'     dplyr::mutate(transcript_name = "SOD1-201")
#'
#' # junctions can be plotted as curves using geom_junction()
#' base <- sod1_junctions %>%
#'     ggplot2::ggplot(ggplot2::aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     ))
#'
#' # sometimes, depending on the number and widths of transcripts and junctions
#' # junctions will overlap one another or extend beyond the plot margin
#' base + geom_junction()
#'
#' # in such cases, junction.y.max can be adjusted to modify the max y of curves
#' base + geom_junction(junction.y.max = 0.5)
#'
#' # ncp can be used to improve the smoothness of curves
#' base + geom_junction(junction.y.max = 0.5, ncp = 30)
#'
#' # junction.orientation controls where the junctions are plotted
#' # with respect to each transcript
#' # either alternating (default), or on the top or bottom
#' base + geom_junction(junction.orientation = "top", junction.y.max = 0.5)
#' base + geom_junction(junction.orientation = "bottom", junction.y.max = 0.5)
#'
#' # it can be useful to overlay junction curves onto existing annotation
#' # plotted using geom_range() and geom_intron()
#' base <- sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name")
#'     )
#'
#' base + geom_junction(
#'     data = sod1_junctions,
#'     junction.y.max = 0.5
#' )
#'
#' # as a ggplot2 extension, ggtranscript geoms inherit
#' # the functionality from the parameters and aesthetics in ggplot2
#' # this can be useful when mapping junction thickness to their counts
#' base + geom_junction(
#'     data = sod1_junctions,
#'     aes(linewidth = mean_count),
#'     junction.y.max = 0.5,
#'     colour = "purple"
#' ) +
#'     scale_linewidth(range = c(0.1, 1))
#'
#' # it can be useful to combine geom_junction() with geom_half_range()
#' sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_half_range() +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name")
#'     ) +
#'     geom_junction(
#'         data = sod1_junctions,
#'         aes(linewidth = mean_count),
#'         junction.y.max = 0.5,
#'         junction.orientation = "top",
#'         colour = "purple"
#'     ) +
#'     scale_linewidth(range = c(0.1, 1))
geom_junction <- function(mapping = NULL,
                          data = NULL,
                          stat = "identity",
                          position = "identity",
                          junction.orientation = "alternating",
                          junction.y.max = 1,
                          angle = 90,
                          ncp = 15,
                          na.rm = FALSE,
                          orientation = NA,
                          show.legend = NA,
                          inherit.aes = TRUE,
                          ...) {
    # parameters forwarded to the layer: the named geom_junction() arguments
    # first, followed by anything supplied through `...`
    junction_params <- list(
        junction.orientation = junction.orientation,
        junction.y.max = junction.y.max,
        angle = angle,
        ncp = ncp,
        na.rm = na.rm,
        orientation = orientation,
        ...
    )

    # build the layer around GeomJunction
    ggplot2::layer(
        geom = GeomJunction,
        stat = stat,
        position = position,
        data = data,
        mapping = mapping,
        show.legend = show.legend,
        inherit.aes = inherit.aes,
        params = junction_params
    )
}

#' `GeomJunction` extends `ggplot2::GeomLine`: each junction (row) is expanded
#' into the points of a curve, which are then drawn with
#' `ggplot2::GeomLine$draw_panel()`
#'
#' @keywords internal
#' @noRd
GeomJunction <- ggplot2::ggproto("GeomJunction", ggplot2::GeomLine,
    required_aes = c("xstart", "xend", "y"),
    setup_data = function(data, params) {
        # check that junction.orientation is one of the possible options
        .check_junction.orientation(params)
        # check that junction.y.max is a length 1 numeric
        .check_junction.y.max(params)

        # we need a unique group id per junction, rather than per transcript
        # similar to spring example from ggplot2 book
        # https://ggplot2-book.org/spring1.html#spring3
        n_junctions <- nrow(data)
        if (is.null(data$group)) {
            data$group <- seq_len(n_junctions)
        }
        if (anyDuplicated(data$group)) {
            # suffix the row number so that no two junctions share a group
            data$group <- paste(data$group, seq_len(n_junctions), sep = "-")
        }

        # needed to permit usage of xstart/xend
        transform(
            data,
            x = xstart,
            xstart = NULL
        )
    },
    draw_panel = function(data,
                          panel_params,
                          coord,
                          junction.orientation = "alternating",
                          junction.y.max = 1,
                          angle = 90,
                          ncp = 15) {
        # junction_index is the position of each junction within its
        # transcript (y), needed for junction.orientation = "alternating"
        by_transcript <- dplyr::group_by(data, y)
        indexed <- dplyr::mutate(
            by_transcript,
            junction_index = dplyr::row_number()
        )
        data <- dplyr::ungroup(indexed)

        # obtain the actual curves using grid:::calcControlPoints
        curves <- .get_junction_curves(data, angle, ncp)

        # normalise curve points to lie between 0-1
        # scale to fit depending on N txs, width of junctions
        curves <- .get_normalised_curve(
            curves,
            junction.orientation,
            junction.y.max
        )

        ggplot2::GeomLine$draw_panel(curves, panel_params, coord)
    }
)

#' Expand every junction (row of `data`) into the points of its curve
#'
#' @param data layer data with `x`, `xend` and `y` columns; all other columns
#'   are repeated alongside each curve point.
#' @param angle,ncp passed on to `.get_junction_curve()`.
#'
#' @return the per-junction curve points bound together by row.
#'
#' @keywords internal
#' @noRd
.get_junction_curves <- function(data, angle, ncp) {

    # very similar to springs example
    # https://ggplot2-book.org/spring1.html#spring3
    # TODO - implementation could probably be vectorised for speed

    # columns other than the coordinates, carried over to every curve point
    # so that the aesthetics are preserved
    aes_cols <- setdiff(names(data), c("x", "xend", "y"))

    # build the curve for a single row and attach that row's other columns
    curve_for_row <- function(i) {
        curve_points <- .get_junction_curve(
            data$x[i], data$xend[i], data$y[i],
            angle, ncp
        )
        cbind(curve_points, unclass(data[i, aes_cols]))
    }

    per_junction <- lapply(seq_len(nrow(data)), curve_for_row)

    do.call(rbind, per_junction)
}


#' Create the points of a single junction curve
#'
#' @param x,xend x-coordinates at which the curve starts and ends.
#' @param y y-coordinate shared by both ends of the curve.
#' @param angle,ncp passed on to `calcControlPoints()`.
#'
#' @return `data.frame()` with the curve points in `x` and `y`, plus the
#'   input `y` repeated in `y_original`.
#'
#' @keywords internal
#' @noRd
.get_junction_curve <- function(x, xend, y, angle, ncp) {
    # control points between the two ends of the junction
    control_points <- calcControlPoints(
        x1 = x, x2 = xend,
        y1 = y, y2 = y,
        angle = angle,
        curvature = -0.5,
        ncp = ncp
    )

    # the original end points are not included by grid:::calcControlPoints
    # so they are re-added at either end of the curve
    # this makes sure junction curves meet the intron lines
    data.frame(
        x = c(x, control_points$x, xend),
        y = c(y, control_points$y, y),
        y_original = y
    )
}

#' Rescale junction curve points relative to their transcript
#'
#' @param junctions curve points with `y`, `y_original` and `junction_index`
#'   columns.
#' @param junction.orientation one of "alternating", "top" or "bottom". Any
#'   other value returns `junctions` unchanged.
#' @param junction.y.max the max y-value of each junction curve.
#'
#' @return `junctions` with the `y` column rescaled.
#'
#' @keywords internal
#' @noRd
.get_normalised_curve <- function(junctions,
                                  junction.orientation,
                                  junction.y.max) {

    # junction.y.max is equivalent to the max y of each junction curve
    # the scaling factor (sf) is used to normalise the junction curve points
    sf <- 1 / junction.y.max

    # points with y == y_original (the curve ends) are left untouched
    # every other point is first divided by max(y), then by sf, and the result
    # is added to ("top") or subtracted from ("bottom") the original tx y

    if (junction.orientation == "top") {
        return(dplyr::mutate(
            junctions,
            y = ifelse(y == y_original, y, y_original + (y / max(y)) / sf)
        ))
    }

    if (junction.orientation == "bottom") {
        return(dplyr::mutate(
            junctions,
            y = ifelse(y == y_original, y, y_original - (y / max(y)) / sf)
        ))
    }

    if (junction.orientation == "alternating") {
        # even junction_index goes below the transcript, odd goes above
        return(dplyr::mutate(
            junctions,
            y = dplyr::case_when(
                y == y_original ~ y,
                junction_index %% 2 == 0 ~ y_original - (y / max(y) / sf),
                junction_index %% 2 == 1 ~ y_original + (y / max(y) / sf)
            )
        ))
    }

    junctions
}

#' Validate the `junction.orientation` layer parameter
#'
#' Called from `GeomJunction$setup_data()`.
#'
#' @param params `list()` of layer parameters. Only the
#'   `junction.orientation` element is inspected.
#'
#' @return `NULL`, invisibly. An error is raised unless `junction.orientation`
#'   is a single value equal to "alternating", "top" or "bottom".
#'
#' @keywords internal
#' @noRd
.check_junction.orientation <- function(params) {
    orientation <- params$junction.orientation

    # the length check comes first so that NULL or vector input produces the
    # validation error below rather than an invalid `if` condition
    is_orient_option <- length(orientation) == 1 &&
        orientation %in% c("alternating", "top", "bottom")

    if (!is_orient_option) {
        stop(
            "junction.orientation must be one of ",
            "'alternating', 'top' or 'bottom'"
        )
    }

    invisible(NULL)
}

#' Validate the `junction.y.max` layer parameter
#'
#' @param params `list()` of layer parameters. Only the `junction.y.max`
#'   element is inspected.
#'
#' @return `NULL`, invisibly. An error is raised unless `junction.y.max` is a
#'   length 1 numeric.
#'
#' @keywords internal
#' @noRd
.check_junction.y.max <- function(params) {
    y_max <- params$junction.y.max

    # length is checked before type, so a missing (NULL) or vector value
    # reports the length problem
    if (length(y_max) != 1) {
        stop("junction.y.max must have a length of 1")
    }

    if (!is.numeric(y_max)) {
        stop("junction.y.max must be a numeric value (integer/double)")
    }
}

# local alias for grid's calcControlPoints(), reached through `:::` (the
# operator for accessing objects a package does not export)
# called by .get_junction_curve() to obtain the junction curve points
calcControlPoints <- grid:::calcControlPoints


================================================
FILE: R/geom_junction_label_repel.R
================================================
#' Label junction curves
#'
#' `geom_junction_label_repel()` labels junction curves at their midpoint using
#' `ggrepel::geom_label_repel()`. This can be useful to label and compare
#' junctions (plotted using `geom_junction()`) with metrics of their usage (e.g.
#' read counts or percent-spliced-in).
#'
#' `geom_junction_label_repel()` requires the following `aes()`; `xstart`,
#' `xend`, `y` (e.g. transcript name) and `label`. Under the hood,
#' `geom_junction_label_repel()` generates the same junction curves as
#' `geom_junction()` to obtain curve midpoints for labeling. Therefore, it is
#' important that users use the same input data and parameters that alter
#' junction curves (namely `junction.orientation`, `junction.y.max`, `angle`,
#' `ncp`) for `geom_junction_label_repel()` that they have used for
#' `geom_junction()`.
#'
#' @inheritParams ggrepel::geom_text_repel
#' @inheritParams grid::curveGrob
#' @inheritParams geom_junction
#'
#' @return the return value of a `geom_*` function is not intended to be
#'   directly handled by users. Therefore, `geom_*` functions should never be
#'   executed in isolation, rather used in combination with a
#'   `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # as well as a set of example (unannotated) junctions
#' # originating from GTEx and downloaded via the Bioconductor package snapcount
#' sod1_junctions
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(
#'     type == "exon",
#'     transcript_name == "SOD1-201"
#' )
#' sod1_exons %>% head()
#'
#' # add transcript_name to junctions for plotting
#' sod1_junctions <- sod1_junctions %>%
#'     dplyr::mutate(transcript_name = "SOD1-201")
#'
#' # geom_junction_label_repel() can be used to label junctions
#' base <- sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name")
#'     )
#'
#' # this can be useful to label junctions with their counts
#' base +
#'     geom_junction(
#'         data = sod1_junctions,
#'         junction.y.max = 0.5
#'     ) +
#'     geom_junction_label_repel(
#'         data = sod1_junctions,
#'         aes(label = round(mean_count, 2)),
#'         junction.y.max = 0.5
#'     )
geom_junction_label_repel <- function(mapping = NULL,
                                      data = NULL,
                                      stat = "identity",
                                      position = "identity",
                                      parse = FALSE,
                                      ...,
                                      junction.orientation = "alternating",
                                      junction.y.max = 1,
                                      angle = 90,
                                      ncp = 15,
                                      box.padding = 0.25,
                                      label.padding = 0.25,
                                      point.padding = 1e-6,
                                      label.r = 0.15,
                                      label.size = 0.25,
                                      min.segment.length = 0,
                                      arrow = NULL,
                                      force = 1,
                                      force_pull = 1,
                                      max.time = 0.5,
                                      max.iter = 10000,
                                      max.overlaps = getOption("ggrepel.max.overlaps", default = 10),
                                      nudge_x = 0,
                                      nudge_y = 0,
                                      xlim = c(NA, NA),
                                      ylim = c(NA, NA),
                                      na.rm = FALSE,
                                      show.legend = NA,
                                      direction = c("both", "y", "x"),
                                      seed = NA,
                                      verbose = FALSE,
                                      inherit.aes = TRUE) {
    # nudging is implemented as a position adjustment, so it cannot be
    # combined with a user-supplied `position`
    if (!missing(nudge_x) || !missing(nudge_y)) {
        if (!missing(position)) {
            stop("Specify either `position` or `nudge_x`/`nudge_y`", call. = FALSE)
        }
        # namespace-qualified, like every other ggrepel reference in this
        # file, so the call does not rely on ggrepel being imported/attached
        position <- ggrepel::position_nudge_repel(nudge_x, nudge_y)
    }

    # parameters forwarded to the layer: the junction curve parameters,
    # the ggrepel parameters (sizes converted with to_unit()) and finally
    # anything supplied through `...`
    ggplot2::layer(
        data = data,
        mapping = mapping,
        stat = stat,
        geom = GeomJunctionLabelRepel,
        position = position,
        show.legend = show.legend,
        inherit.aes = inherit.aes,
        params = list(
            parse = parse,
            junction.orientation = junction.orientation,
            junction.y.max = junction.y.max,
            angle = angle,
            ncp = ncp,
            box.padding  = to_unit(box.padding),
            label.padding = to_unit(label.padding),
            point.padding  = to_unit(point.padding),
            label.r = to_unit(label.r),
            label.size = label.size,
            min.segment.length = to_unit(min.segment.length),
            arrow = arrow,
            na.rm = na.rm,
            force = force,
            force_pull = force_pull,
            max.time = max.time,
            max.iter = max.iter,
            max.overlaps = max.overlaps,
            nudge_x = nudge_x,
            nudge_y = nudge_y,
            xlim = xlim,
            ylim = ylim,
            # resolves the default vector of choices to a single value
            direction = match.arg(direction),
            seed = seed,
            verbose = verbose,
            ...
        )
    )
}

#' `GeomJunctionLabelRepel` extends `ggrepel::GeomLabelRepel`: the layer data
#' is reduced to the midpoint of each junction curve, which is then labelled
#' with `ggrepel::GeomLabelRepel$draw_panel()`
#'
#' @include geom_junction.R
#' @keywords internal
#' @noRd
GeomJunctionLabelRepel <- ggplot2::ggproto(
    "GeomJunctionLabelRepel", ggrepel::GeomLabelRepel,
    required_aes = c("xstart", "xend", "y", "label"),
    # copied from ggrepel::GeomLabelRepel with segment.colour and segment.alpha
    # defaults set to appropriate values, rather than NULL
    # this avoids warnings e.g. Unknown or uninitialised column: `segment.alpha`
    # but does cause issues when setting e.g. aes(colour = tx)
    # TODO - resolve either warning or make segment.colour borrow colour aes
    default_aes = aes(
        colour = "black",
        fill = "white",
        size = 3.88,
        angle = 0,
        alpha = NA,
        family = "",
        fontface = 1,
        lineheight = 1.2,
        hjust = 0.5,
        vjust = 0.5,
        point.size = 1,
        segment.linetype = 1,
        segment.colour = "black",
        segment.size = 0.5,
        segment.alpha = NA,
        segment.curvature = 0,
        segment.angle = 90,
        segment.ncp = 1,
        segment.shape = 0.5,
        segment.square = TRUE,
        segment.squareShape = 1,
        segment.inflect = FALSE,
        segment.debug = FALSE
    ),
    # same validation, grouping and xstart -> x renaming as geom_junction()
    setup_data = GeomJunction$setup_data,
    draw_panel = function(data, panel_scales, coord,
                          parse = FALSE,
                          na.rm = FALSE,
                          junction.orientation = "alternating",
                          junction.y.max = 1,
                          angle = 90,
                          ncp = 15,
                          box.padding = 0.25,
                          label.padding = 0.25,
                          point.padding = 1e-6,
                          label.r = 0.15,
                          label.size = 0.25,
                          min.segment.length = 0,
                          arrow = NULL,
                          force = 1,
                          force_pull = 1,
                          max.time = 0.5,
                          max.iter = 10000,
                          max.overlaps = 10,
                          nudge_x = 0,
                          nudge_y = 0,
                          xlim = c(NA, NA),
                          ylim = c(NA, NA),
                          direction = "both",
                          seed = NA,
                          verbose = FALSE) {

        # junction_index is the position of each junction within its
        # transcript (y), needed for junction.orientation = "alternating"
        by_transcript <- dplyr::group_by(data, y)
        indexed <- dplyr::mutate(
            by_transcript,
            junction_index = dplyr::row_number()
        )
        data <- dplyr::ungroup(indexed)

        # obtain the midpoints of junction curves (where we want label)
        label_positions <- to_junction_midpoints(
            data,
            angle,
            ncp,
            junction.orientation,
            junction.y.max
        )

        # the repelled labels themselves are drawn by ggrepel
        ggrepel::GeomLabelRepel$draw_panel(
            data = label_positions,
            panel_scales = panel_scales,
            coord = coord,
            parse = parse,
            na.rm = na.rm,
            box.padding = box.padding,
            label.padding = label.padding,
            point.padding = point.padding,
            label.r = label.r,
            label.size = label.size,
            min.segment.length = min.segment.length,
            arrow = arrow,
            force = force,
            force_pull = force_pull,
            max.time = max.time,
            max.iter = max.iter,
            max.overlaps = max.overlaps,
            nudge_x = nudge_x,
            nudge_y = nudge_y,
            xlim = xlim,
            ylim = ylim,
            direction = direction,
            seed = seed,
            verbose = verbose
        )
    }
)

#' Wrapper for obtaining junction curve midpoints
#'
#' Chains `.get_junction_curves()`, `.get_normalised_curve()` and
#' `.get_curve_midpoints()`.
#'
#' @param data layer data, passed to `.get_junction_curves()`.
#' @param angle,ncp passed to `.get_junction_curves()`.
#' @param junction.orientation,junction.y.max passed to
#'   `.get_normalised_curve()`.
#'
#' @return the output of `.get_curve_midpoints()`.
#'
#' @keywords internal
#' @noRd
to_junction_midpoints <- function(data,
                                  angle,
                                  ncp,
                                  junction.orientation,
                                  junction.y.max) {
    # TODO - maybe export this as helper?

    # 1. expand each junction into its curve points
    curves <- .get_junction_curves(data, angle, ncp)

    # 2. rescale the curve points relative to their transcript
    scaled_curves <- .get_normalised_curve(
        curves,
        junction.orientation,
        junction.y.max
    )

    # 3. reduce each curve to its midpoint
    .get_curve_midpoints(scaled_curves)
}

#' Reduce junction curves to their midpoints
#'
#' @param junctions curve points with `x` and `group` columns, one `group`
#'   per junction curve.
#'
#' @return the rows of `junctions` whose `x` is closest to the median `x` of
#'   their `group`.
#'
#' @keywords internal
#' @noRd
.get_curve_midpoints <- function(junctions) {

    # the midpoint of a curve is the point with the x value closest to
    # median(x); an exact == median(x) match would miss curves with an even
    # number of control points (ncp)
    by_curve <- dplyr::group_by(junctions, group)

    with_distance <- dplyr::mutate(
        by_curve,
        median_x = stats::median(x),
        median_diff = abs(x - median_x)
    )

    # NOTE(review): every row tied for the minimum distance is kept, so a
    # curve could presumably yield more than one midpoint - confirm whether
    # ties can occur in practice
    closest <- dplyr::filter(with_distance, median_diff == min(median_diff))

    # drop the grouping and the helper columns before returning
    dplyr::select(dplyr::ungroup(closest), -median_x, -median_diff)
}


# alias ggrepel's `to_unit` under the same name in this package
# NOTE: `:::` reaches an object that ggrepel does not export, so this binding
# depends on ggrepel's internals
to_unit <- ggrepel:::to_unit


================================================
FILE: R/geom_range.R
================================================
#' Plot genomic ranges
#'
#' `geom_range()` and `geom_half_range()` draw tiles that are designed to
#' represent range-based genomic features, such as exons. In combination with
#' `geom_intron()`, these geoms form the core components for visualizing
#' transcript structures.
#'
#' `geom_range()` and `geom_half_range()` require the following `aes()`;
#' `xstart`, `xend` and `y` (e.g. transcript name). `geom_half_range()` takes
#' advantage of the vertical symmetry of transcript annotation by plotting only
#' half of a range on the top or bottom of a transcript structure. This can be
#' useful for comparing between two transcripts or free up plotting space for
#' other transcript annotations (e.g. `geom_junction()`).
#'
#' @inheritParams ggplot2::layer
#' @inheritParams ggplot2::geom_point
#' @inheritParams ggplot2::geom_tile
#' @inheritParams ggplot2::geom_segment
#' @inheritParams grid::rectGrob
#'
#' @return the return value of a `geom_*` function is not intended to be
#'   directly handled by users. Therefore, `geom_*` functions should never be
#'   executed in isolation, rather used in combination with a
#'   `ggplot2::ggplot()` call.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' base <- sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     ))
#'
#' # geom_range() is designed to visualise range-based annotation such as exons
#' base + geom_range()
#'
#' # geom_half_range() allows users to plot half ranges
#' # on the top or bottom of the transcript
#' base + geom_half_range()
#'
#' # where the half ranges are plotted can be adjusted using range.orientation
#' base + geom_half_range(range.orientation = "top")
#'
#' # as a ggplot2 extension, ggtranscript geoms inherit
#' # the functionality from the parameters and aesthetics in ggplot2
#' base + geom_range(
#'     aes(fill = transcript_name),
#'     linewidth = 1
#' )
#'
#' # together, geom_range() and geom_intron() are designed to visualize
#' # the core components of transcript annotation
#' base + geom_range(
#'     aes(fill = transcript_biotype)
#' ) +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name")
#'     )
#'
#' # for protein coding transcripts
#' # geom_range() can be useful for visualizing UTRs that lie outside of the CDS
#' sod1_exons_prot_coding <- sod1_exons %>%
#'     dplyr::filter(transcript_biotype == "protein_coding")
#'
#' # extract cds
#' sod1_cds <- sod1_annotation %>%
#'     dplyr::filter(type == "CDS")
#'
#' sod1_exons_prot_coding %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range(
#'         fill = "white",
#'         height = 0.25
#'     ) +
#'     geom_range(
#'         data = sod1_cds
#'     ) +
#'     geom_intron(
#'         data = to_intron(sod1_exons_prot_coding, "transcript_name")
#'     )
#'
#' # geom_half_range() can be useful for comparing between two transcripts
#' # enabling visualization of one transcript on the top, other on the bottom
#' sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
#' sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201")
#' sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202")
#' sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202")
#'
#' sod1_201_plot <- sod1_201_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = "SOD1-201/202"
#'     )) +
#'     geom_half_range(
#'         fill = "white",
#'         height = 0.125
#'     ) +
#'     geom_half_range(
#'         data = sod1_201_cds
#'     ) +
#'     geom_intron(
#'         data = to_intron(sod1_201_exons, "transcript_name")
#'     )
#'
#' sod1_201_plot
#'
#' sod1_201_202_plot <- sod1_201_plot +
#'     geom_half_range(
#'         data = sod1_202_exons,
#'         range.orientation = "top",
#'         fill = "white",
#'         height = 0.125
#'     ) +
#'     geom_half_range(
#'         data = sod1_202_cds,
#'         range.orientation = "top",
#'         fill = "purple"
#'     ) +
#'     geom_intron(
#'         data = to_intron(sod1_202_exons, "transcript_name")
#'     )
#'
#' sod1_201_202_plot
#'
#' # leveraging existing ggplot2 functionality via e.g. coord_cartesian()
#' # can be useful to zoom in on areas of interest
#' sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))
geom_range <- function(mapping = NULL, data = NULL,
                       stat = "identity", position = "identity",
                       ...,
                       vjust = NULL,
                       linejoin = "mitre",
                       na.rm = FALSE,
                       show.legend = NA,
                       inherit.aes = TRUE) {
    # build a layer drawn by GeomRange; the named drawing parameters and
    # anything supplied through `...` are forwarded together as `params`
    ggplot2::layer(
        geom = GeomRange,
        stat = stat,
        position = position,
        mapping = mapping,
        data = data,
        inherit.aes = inherit.aes,
        show.legend = show.legend,
        params = list(
            vjust = vjust,
            linejoin = linejoin,
            na.rm = na.rm,
            ...
        )
    )
}

#' `GeomRange` is `ggplot2::GeomTile` with modified `aes` to match genetic
#' nomenclature (`xstart`/`xend`)
#'
#' `setup_data()` converts `xstart`/`xend`/`y`/`height` into the rectangle
#' corners (`xmin`/`xmax`/`ymin`/`ymax`) and `draw_panel()` draws one
#' rectangle per row.
#' @keywords internal
#' @noRd
GeomRange <- ggplot2::ggproto("GeomRange", ggplot2::GeomTile,
    required_aes = c("xstart", "xend", "y"),
    default_aes = aes(
        fill = "grey",
        colour = "black",
        linewidth = 0.25,
        linetype = 1,
        alpha = NA,
        height = NA
    ),
    setup_data = function(data, params) {
        # modified from ggplot2::GeomTile
        # height falls back from the data column, to the layer parameter,
        # to a default of 0.5
        data$height <- data$height %||% params$height %||% 0.5

        # ranges are centred vertically on y; height is dropped once it has
        # been converted into ymin/ymax
        transform(
            data,
            xmin = xstart,
            xmax = xend,
            ymin = y - height / 2,
            ymax = y + height / 2,
            height = NULL
        )
    },
    draw_panel = function(self,
                          data,
                          panel_params,
                          coord,
                          vjust = NULL,
                          lineend = "butt",
                          linejoin = "mitre") {
        if (!coord$is_linear()) {
            # prefer to match geom_curve and warn
            # rather than copy the implementation from GeomRect for simplicity
            # also don't think geom_range would be used for non-linear coords
            # warn() is namespace-qualified as only `%||%` is imported from
            # rlang; an unqualified call would fail to find the function
            rlang::warn(
                "geom_range is not implemented for non-linear coordinates"
            )
        }

        coords <- coord$transform(data, panel_params)

        # each rectangle is anchored at its top-left corner (xmin, ymax)
        grid::rectGrob(
            coords$xmin, coords$ymax,
            width = coords$xmax - coords$xmin,
            height = coords$ymax - coords$ymin,
            default.units = "native",
            just = c("left", "top"),
            vjust = vjust,
            gp = grid::gpar(
                col = coords$colour,
                fill = ggplot2::alpha(coords$fill, coords$alpha),
                lwd = coords$linewidth * ggplot2::.pt,
                lty = coords$linetype,
                linejoin = linejoin,
                lineend = lineend
            )
        )
    }
)


================================================
FILE: R/ggtranscript-package.R
================================================
#' `ggtranscript`: Visualizing transcript structure and annotation using
#' `ggplot2`
#'
#' The goal of `ggtranscript` is to simplify the process of visualizing
#' transcript structure and annotation. To achieve this, `ggtranscript`
#' introduces 5 new geoms (`geom_range()`, `geom_half_range()`, `geom_intron()`,
#' `geom_junction()` and `geom_junction_label_repel()`) as well as several
#' helper functions. As a `ggplot2` extension, `ggtranscript` inherits
#' `ggplot2`'s familiarity and flexibility, enabling users to intuitively adjust
#' aesthetics, parameters, scales etc as well as complement `ggtranscript` geoms
#' with existing `ggplot2` geoms to create informative, publication-ready plots.
#'
#' @docType package
#' @name ggtranscript
"_PACKAGE"

#' @importFrom rlang %||%
#' @importFrom magrittr %>%
#' @importFrom ggplot2 aes
#' @import ggrepel
NULL


================================================
FILE: R/globals.R
================================================
# bypass R CMD Check notes, related to tidyverse non-standard evaluation
# https://www.r-bloggers.com/2019/08/no-visible-binding-for-global-variable/
# every name below is declared as a known global variable for the package
# NOTE(review): the inline comments appear to group the names by the function
# that uses them - confirm before moving entries between groups
utils::globalVariables(c(
    "x",
    "start",
    "end",
    ":=",
    "intron_start",
    "intron_end",
    "exons",
    "xend",
    "mid",
    "index",
    "diff_type",
    "in_exons",
    "in_ref_exons",
    ".",
    # shorten_gaps()
    "width",
    "rescaled_start",
    "rescaled_end",
    "width_tx_start",
    "seqnames",
    "strand",
    "shorten_type",
    "gap_width",
    "shortened_gap_width",
    "shortened_gap_diff",
    "sum_shortened_gap_diff",
    "intron_indexes",
    "shortened_width",
    "type",
    # add_exon_number()
    "exon_number",
    "n_exons",
    # geom_junction_label_repel()
    "group",
    "median",
    "median_x",
    "median_diff",
    "x_points",
    "y_points",
    "y",
    "y_original",
    "position_nudge_repel",
    # add_utr()
    "dummy_group"
))


================================================
FILE: R/shorten_gaps.R
================================================
#' Improve transcript structure visualization by shortening gaps
#'
#' For a given set of exons and introns, `shorten_gaps()` reduces the width of
#' gaps (regions that do not overlap any `exons`) to a user-inputted
#' `target_gap_width`. This can be useful when visualizing transcripts that have
#' long introns, to hone in on the regions of interest (i.e. exons) and better
#' compare between transcript structures.
#'
#' After `shorten_gaps()` reduces the size of gaps, it will re-scale `exons` and
#' `introns` to preserve exon alignment. This process will only reduce the width
#' of input `introns`, never `exons`. Importantly, the outputted re-scaled
#' co-ordinates should only be used for visualization as they will not match the
#' original genomic coordinates.
#'
#' @inheritParams to_diff
#' @param introns `data.frame()` the intron co-ordinates corresponding to the
#'   input `exons`. This can be created by applying `to_intron()` to the
#'   `exons`. If introns originate from multiple transcripts, they must be
#'   differentiated using `group_var`. If a user is not using `to_intron()`,
#'   they must make sure intron start/ends are defined precisely as the adjacent
#'   exon boundaries (rather than exon end + 1 and exon start - 1).
#' @param target_gap_width `integer()` the width in base pairs to shorten the
#'   gaps to.
#'
#' @return `data.frame()` contains the re-scaled co-ordinates of `introns` and
#'   `exons` of each input transcript with shortened gaps.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' pknox1_annotation %>% head()
#'
#' # extract exons
#' pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
#' pknox1_exons %>% head()
#'
#' # to_intron() is a helper function included in ggtranscript
#' # which is useful for converting exon co-ordinates to introns
#' pknox1_introns <- pknox1_exons %>% to_intron(group_var = "transcript_name")
#' pknox1_introns %>% head()
#'
#' # for transcripts with long introns, the exons of interest
#' # can be difficult to visualize clearly when using the default scale
#' pknox1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = pknox1_introns,
#'         arrow.min.intron.length = 3500
#'     )
#'
#' # in such cases it can be useful to rescale the exons and introns
#' # using shorten_gaps() which shortens regions that do not overlap an exon
#' pknox1_rescaled <-
#'     shorten_gaps(pknox1_exons, pknox1_introns, group_var = "transcript_name")
#'
#' pknox1_rescaled %>% head()
#'
#' # this allows us to visualize differences in exonic structure more clearly
#' pknox1_rescaled %>%
#'     dplyr::filter(type == "exon") %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = pknox1_rescaled %>% dplyr::filter(type == "intron"),
#'         arrow.min.intron.length = 300
#'     )
#'
#' # shorten_gaps() can be used in combination with to_diff()
#' # to further highlight differences in exon structure
#' # here, all other transcripts are compared to the MANE-select transcript
#' pknox1_rescaled_diffs <- to_diff(
#'     exons = pknox1_rescaled %>%
#'         dplyr::filter(type == "exon", transcript_name != "PKNOX1-201"),
#'     ref_exons = pknox1_rescaled %>%
#'         dplyr::filter(type == "exon", transcript_name == "PKNOX1-201"),
#'     group_var = "transcript_name"
#' )
#'
#' pknox1_rescaled %>%
#'     dplyr::filter(type == "exon") %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = pknox1_rescaled %>% dplyr::filter(type == "intron"),
#'         arrow.min.intron.length = 300
#'     ) +
#'     geom_range(
#'         data = pknox1_rescaled_diffs,
#'         aes(fill = diff_type),
#'         alpha = 0.2
#'     )
shorten_gaps <- function(exons,
                         introns,
                         group_var = NULL,
                         target_gap_width = 100L) {

    # input checks
    .check_coord_object(exons, check_seqnames = TRUE, check_strand = TRUE)
    .check_coord_object(introns, check_seqnames = TRUE, check_strand = TRUE)
    .check_group_var(exons, group_var)
    .check_group_var(introns, group_var)
    target_gap_width <- .check_target_gap_width(target_gap_width)

    # check type column, create if not present
    exons <- .get_type(exons, "exons")
    introns <- .get_type(introns, "introns")

    # to_intron() defines introns using the exon boundaries
    # we need to convert this to the actual gap definition to make sure
    # comparison to GenomicRanges::gaps() when using "equal" works correctly
    # this is converted back in .get_rescaled_txs()
    introns <- introns %>%
        dplyr::mutate(
            start = start + 1,
            end = end - 1
        )

    # we use GenomicRanges methods for downstream processing
    exons_gr <- GenomicRanges::GRanges(exons)
    introns_gr <- GenomicRanges::GRanges(introns)

    # obtain actual gaps, i.e. regions that overlap no exons
    intron_gaps <- .get_gaps(exons_gr)

    # by mapping gaps back to introns, we can then shorten overlapping gaps
    gap_map_intron <- .get_gap_map(introns_gr, intron_gaps)
    introns_shortened <- .get_shortened_gaps(
        introns,
        intron_gaps,
        gap_map_intron,
        group_var,
        target_gap_width
    )

    # always bind tx_start_gaps_shortened so the call to .get_rescaled_txs()
    # below never depends on lazy evaluation of an undefined variable
    # it stays NULL (and unused) when there is only 1 tx
    tx_start_gaps_shortened <- NULL

    # don't have to take tx_start_gaps into account if only 1 tx
    if (!is.null(group_var)) {
        # because we're shortening intron_gaps, we also need to shorten the
        # region from start of the plot and start of each tx (tx_start_gaps)
        tx_start_gaps <- .get_tx_start_gaps(exons, group_var)
        gap_map_tx_start_gaps <- .get_gap_map(
            tx_start_gaps %>% GenomicRanges::GRanges(),
            intron_gaps
        )

        # only the group column(s) and the shortened width are needed for
        # the join performed in .get_rescaled_txs()
        tx_start_gaps_shortened <- .get_shortened_gaps(
            tx_start_gaps,
            intron_gaps,
            gap_map_tx_start_gaps,
            group_var,
            target_gap_width
        ) %>%
            dplyr::select(-start, -end, -strand, -seqnames)
    }

    rescaled_tx <- .get_rescaled_txs(
        exons,
        introns_shortened,
        tx_start_gaps_shortened,
        group_var
    )

    return(rescaled_tx)
}

#' Ensure a `type` column exists, validating it for introns
#'
#' If `x` has no `type` column, one is added holding "exon" or "intron"
#' depending on `exons_introns`. An existing column is only validated when
#' `exons_introns` is "introns".
#'
#' @keywords internal
#' @noRd
.get_type <- function(x, exons_introns) {
    if (is.null(x[["type"]])) {
        # no type column yet, so fill it with the default for this input
        default_type <- ifelse(exons_introns == "exons", "exon", "intron")

        return(dplyr::mutate(x, type = default_type))
    }

    # an existing type column of introns must be all "intron" for downstream
    # functions; exons are not checked, as their type can be variable
    # (e.g. "five_prime_utr")
    if (exons_introns == "introns") {
        allowed_types <- "intron"

        if (!all(x[["type"]] %in% allowed_types)) {
            quoted_types <- paste0("'", allowed_types, "'")

            stop(
                "values in the 'type' column of ", exons_introns, " must be one of: ",
                paste(quoted_types, collapse = ", ")
            )
        }
    }

    return(x)
}

#' Obtain the gaps between the exons after collapsing them into one
#' meta transcript
#'
#' @keywords internal
#' @noRd
.get_gaps <- function(exons_gr) {
    orig_seqnames <- unique(as.character(GenomicRanges::seqnames(exons_gr)))
    orig_strand <- unique(as.character(GenomicRanges::strand(exons_gr)))

    # stop unless the exons share a single seqname and a single strand
    .check_len_1_strand_seqnames(orig_seqnames, orig_strand)

    # "reduce" exons - here meaning to collapse into single meta transcript
    meta_tx <- GenomicRanges::reduce(exons_gr)

    # keep only the relevant seqnames, otherwise gaps includes all seqlevels
    GenomeInfoDb::seqlevels(meta_tx, pruning.mode = "coarse") <-
        orig_seqnames

    # gaps of the meta transcript, bounded by its first start and last end
    all_gaps <- GenomicRanges::gaps(
        meta_tx,
        start = min(GenomicRanges::start(meta_tx)),
        end = max(GenomicRanges::end(meta_tx))
    )

    # gaps creates a gap per strand too, keep only those from the original
    # strand
    on_orig_strand <- GenomicRanges::strand(all_gaps) == orig_strand

    return(all_gaps[on_orig_strand])
}

#' Obtain the region between the start of the plot and each transcript start
#'
#' @keywords internal
#' @noRd
.get_tx_start_gaps <- function(exons, group_var) {

    # the transcript starts need rescaling too, so that the rescaled
    # introns/exons stay aligned - this has to take into account whether the
    # intron_gaps overlapping the transcript start have been shortened
    # per transcript, the tx_start_gap spans from
    # 1. the start of plot (smallest start position of all txs), to
    # 2. the start of that tx
    exons_by_tx <- dplyr::group_by_at(exons, .vars = c(group_var))

    # `end` must be computed before `start` is overwritten, as it reads the
    # original start column
    start_gaps <- dplyr::summarise(
        exons_by_tx,
        seqnames = unique(seqnames),
        strand = unique(strand),
        end = min(start), # min start of this transcript
        start = min(exons[["start"]]) # min start of all transcripts
    )

    return(start_gaps)
}

#' map the gaps back to introns/transcript start gaps
#'
#' Returns a list with two data.frames of overlap hits between `intron_gaps`
#' (query) and `y` (subject): `equal` and `pure_within`.
#' @keywords internal
#' @noRd
.get_gap_map <- function(y, intron_gaps) {

    # to reduce the length of the intron_gaps whilst keeping the
    # exons/introns aligned, the intron_gaps are mapped back onto `y`
    overlap_hits <- function(overlap_type) {
        hits <- GenomicRanges::findOverlaps(
            intron_gaps,
            y,
            type = overlap_type
        )

        # data.frame() form is needed to use dplyr::anti_join()
        as.data.frame(hits)
    }

    # the simplest case is when gaps are identical to original introns
    equal_hits <- overlap_hits("equal")

    # often, the intron_gaps don't map identically
    # this occurs due to the exons of one tx overlapping the intron of another
    # type = "within" finds gaps completely contained within an original
    # intron, but this also catches the "equal" intron_gaps
    within_hits <- overlap_hits("within")

    # remove the "equal" hits from the "within"
    pure_within_hits <- dplyr::anti_join(
        within_hits,
        equal_hits,
        by = c("queryHits", "subjectHits")
    )

    # need both "equal" and "pure_within" hits
    return(list(
        equal = equal_hits,
        pure_within = pure_within_hits
    ))
}

#' Shorten the widths of introns/transcript start gaps
#'
#' Replaces the `width` of each row of `y` with its shortened width, based on
#' how the `intron_gaps` map onto it (`gap_map`).
#'
#' @keywords internal
#' @noRd
.get_shortened_gaps <- function(y,
                                intron_gaps,
                                gap_map,
                                group_var,
                                target_gap_width) {

    # in a single pass:
    # 1. compute the intron/tx_start_gap widths (needed to shorten them)
    # 2. characterise each row by shortening type
    # 3. for the "equal" cases, simply shorten to the target_gap_width
    shortened <- dplyr::mutate(
        y,
        width = (end - start) + 1,
        shorten_type = dplyr::case_when(
            dplyr::row_number() %in% gap_map[["equal"]][["subjectHits"]] ~
            "equal",
            dplyr::row_number() %in% gap_map[["pure_within"]][["subjectHits"]] ~
            "pure_within",
            TRUE ~ "none"
        ),
        shortened_width = ifelse(
            (shorten_type == "equal") & (width > target_gap_width),
            target_gap_width,
            width
        )
    )

    # for the "within" cases the intron widths must be shortened
    # by the !total! amount the overlapping gaps are shortened
    within_gap_idx <- gap_map[["pure_within"]][["queryHits"]]
    within_intron_idx <- gap_map[["pure_within"]][["subjectHits"]]

    # only needed if there are gaps that are "pure_within"
    if (length(within_gap_idx) > 0) {

        # one intron may overlap multiple gaps
        # first, work out by how much each overlapping gap is reduced
        gap_reduction <- dplyr::tibble(
            intron_indexes = within_intron_idx,
            gap_width = GenomicRanges::width(intron_gaps)[within_gap_idx]
        )
        gap_reduction <- dplyr::mutate(
            gap_reduction,
            shortened_gap_width = ifelse(
                gap_width > target_gap_width,
                target_gap_width,
                gap_width
            ),
            shortened_gap_diff = gap_width - shortened_gap_width
        )

        # then sum the reductions per intron
        gap_reduction <- dplyr::summarise(
            dplyr::group_by(gap_reduction, intron_indexes),
            sum_shortened_gap_diff = sum(shortened_gap_diff)
        )

        # rows without "pure_within" gaps keep NA and are left untouched below
        shortened[["sum_shortened_gap_diff"]] <- NA_integer_

        shortened[["sum_shortened_gap_diff"]][gap_reduction[["intron_indexes"]]] <-
            gap_reduction[["sum_shortened_gap_diff"]]

        # now actually do reduction for introns with "pure_within" gaps
        shortened <- dplyr::mutate(
            shortened,
            shortened_width = ifelse(
                is.na(sum_shortened_gap_diff),
                shortened_width,
                width - sum_shortened_gap_diff
            )
        )
        shortened <- dplyr::select(shortened, -sum_shortened_gap_diff)
    }

    # remove unnecessary intermediate cols,
    # the shortened width takes over the `width` name
    shortened <- dplyr::select(
        shortened,
        -shorten_type,
        -width,
        width = shortened_width
    )

    return(shortened)
}

#' Calculate the rescaled exon/intron co-ordinates
#'
#' Uses the exon widths and the shortened intron widths to lay out each
#' transcript, then offsets every transcript by its (shortened) start gap.
#'
#' @keywords internal
#' @noRd
.get_rescaled_txs <- function(exons,
                              introns_shortened,
                              tx_start_gaps_shortened,
                              group_var) {

    # exons need a width too; introns_shortened already carries one
    exons_with_width <- dplyr::mutate(exons, width = (end - start) + 1)

    # bind together exons and introns and arrange into genomic order
    features <- dplyr::bind_rows(exons_with_width, introns_shortened)
    features <- dplyr::arrange_at(
        features,
        .vars = c(group_var, "start", "end")
    )

    # rescaled coords are the cumsum of the widths of introns/exons,
    # calculated separately for each transcript
    features <- dplyr::group_by_at(features, .vars = c(group_var))
    features <- dplyr::mutate(
        features,
        rescaled_end = cumsum(width),
        rescaled_start = rescaled_end - (width - 1)
    )
    features <- dplyr::ungroup(features)

    # account for the tx starts being in different places
    # to keep everything aligned
    if (is.null(group_var)) {
        # if only 1 tx, we use 1 as the dummy rescaled tx_start
        features <- dplyr::mutate(features, width_tx_start = 1)
    } else {
        # the joined width column is suffixed, giving width_tx_start
        features <- dplyr::left_join(
            features,
            tx_start_gaps_shortened,
            by = c(group_var),
            suffix = c("", "_tx_start")
        )
    }

    # shift every feature by the start offset of its transcript
    features <- dplyr::mutate(
        features,
        rescaled_end = rescaled_end + width_tx_start,
        rescaled_start = rescaled_start + width_tx_start
    )
    features <- dplyr::select(features, -dplyr::contains("width"))

    # convert introns back to be defined by exon boundaries, match to_intron()
    features <- dplyr::mutate(
        features,
        start = ifelse(type == "intron", start - 1, start),
        end = ifelse(type == "intron", end + 1, end),
        rescaled_start = ifelse(
            type == "intron", rescaled_start - 1, rescaled_start
        ),
        rescaled_end = ifelse(
            type == "intron", rescaled_end + 1, rescaled_end
        )
    )

    # remove original start/end, the rescaled co-ordinates take their names
    features <- dplyr::select(features, -start, -end)
    features <- dplyr::select(
        features,
        seqnames,
        start = rescaled_start,
        end = rescaled_end,
        strand,
        dplyr::everything()
    )

    return(features)
}

#' Stop if the exons span more than one seqname or strand
#'
#' The exons are expected to originate from a single gene, so the unique
#' seqnames and the unique strand should each be of length 1.
#' @keywords internal
#' @noRd
.check_len_1_strand_seqnames <- function(orig_seqnames, orig_strand) {
    # shared tail of both error messages
    problem <- "of object contains more than 1 unique value. "
    expectation <- "object is expected to contain exons from a single gene."
    msg_tail <- paste0(problem, expectation)

    if (length(orig_seqnames) != 1) stop("seqnames ", msg_tail)

    if (length(orig_strand) != 1) stop("strand ", msg_tail)
}

#' Coerce `target_gap_width` to an integer, warning if coercion is needed
#'
#' @keywords internal
#' @noRd
.check_target_gap_width <- function(target_gap_width) {
    # integers pass straight through
    if (is.integer(target_gap_width)) {
        return(target_gap_width)
    }

    # anything else is coerced, with a warning
    warning("target_gap_width must be an integer, coercing...")

    return(as.integer(target_gap_width))
}


================================================
FILE: R/to_diff.R
================================================
#' Obtain the differences between transcript structure
#'
#' `to_diff()` obtains the difference between `exons` from a set of transcripts
#' to a reference transcript (`ref_exons`). This can be useful when visualizing
#' the differences between transcript structure. `to_diff()` expects two sets of
#' input exons; 1. `exons` - exons from any number of transcripts that will be
#' compared to `ref_exons` and 2. `ref_exons` - exons from a single transcript
#' which acts as the reference to compare against.
#'
#' @param exons `data.frame()` contains exons which can originate from multiple
#'   transcripts differentiated by `group_var`.
#' @param ref_exons `data.frame()` contains exons that originate from a single
#'   transcript, which `exons` will be compared against.
#' @param group_var `character()` if input data originates from more than 1
#'   transcript, `group_var` must specify the column that differentiates
#'   transcripts (e.g. "transcript_id").
#'
#' @return `data.frame()` details the differences between `exons` and
#'   `ref_exons`.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' # for this example, let's compare transcripts to the MANE-select transcript
#' sod1_mane <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
#' sod1_not_mane <- sod1_exons %>% dplyr::filter(transcript_name != "SOD1-201")
#'
#' # to_diff() obtains the differences between the exons as ranges
#' sod1_diffs <- to_diff(
#'     exons = sod1_not_mane,
#'     ref_exons = sod1_mane,
#'     group_var = "transcript_name"
#' )
#'
#' sod1_diffs %>% head()
#'
#' # using geom_range(), it can be useful to visually overlay
#' # the differences on top of the transcript annotation
#' sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name")
#'     ) +
#'     geom_range(
#'         data = sod1_diffs,
#'         ggplot2::aes(fill = diff_type),
#'         alpha = 0.2
#'     )
to_diff <- function(exons, ref_exons, group_var = NULL) {
    .check_coord_object(exons, check_seqnames = TRUE, check_strand = TRUE)
    .check_coord_object(ref_exons, check_seqnames = TRUE, check_strand = TRUE)
    .check_group_var(exons, group_var)

    # with a user-supplied group, .get_diff() can be used directly
    if (!is.null(group_var)) {
        return(.get_diff(exons, ref_exons, group_var))
    }

    # .get_diff() needs a group column, so without one all exons are placed
    # into a single dummy group, which is removed again from the result
    dummy_var <- "dummy_group"
    exons <- dplyr::mutate(exons, dummy_group = "A")

    diffs <- .get_diff(exons, ref_exons, dummy_var)
    diffs[[dummy_var]] <- NULL

    return(diffs)
}

#' The heavy lifting of `to_diff()` happens here.
#'
#' For every group in `exons`, the exons of that group and the `ref_exons` are
#' broken into disjoint pieces. Pieces overlapping only one of the two sets
#' are returned as differences, labelled via `diff_type` ("not_in_ref" or
#' "in_ref").
#'
#' @keywords internal
#' @noRd
.get_diff <- function(exons, ref_exons, group_var) {
    groups <- exons[[group_var]] %>% unique()

    # needs to be a genomic range for downstream processing
    exons_gr <- GenomicRanges::GRanges(exons)
    ref_exons_gr <- GenomicRanges::GRanges(ref_exons)

    # one slot per group - sized by the groups themselves, not by group_var
    # (which only holds the name of the grouping column)
    diffs <- vector("list", length = length(groups))

    for (i in seq_along(groups)) {
        exons_gr_curr <- exons_gr %>%
            .[GenomicRanges::mcols(exons_gr)[[group_var]] == groups[i]]

        # get the disjoint pieces (flattening and breaking apart exons)
        disjoint_pieces <- GenomicRanges::disjoin(
            c(ref_exons_gr, exons_gr_curr)
        )

        # find whether the disjoint pieces overlap exons or ref_exons
        # those that only overlap 1 are the differences
        # TODO - perhaps allow modification of findOverlaps() via ... ?
        overlap_exons <- GenomicRanges::findOverlaps(
            disjoint_pieces, exons_gr_curr
        )
        overlap_ref_exons <- GenomicRanges::findOverlaps(
            disjoint_pieces, ref_exons_gr
        )

        # convert pieces back to data.frame and classify diffs
        # TODO - could improve efficiency by placing this step post-loop
        # i.e. manipulate the grs instead
        diff_curr <- disjoint_pieces %>%
            as.data.frame() %>%
            dplyr::mutate(
                index = dplyr::row_number(),
                type = "diff",
                in_exons = index %in% S4Vectors::queryHits(overlap_exons),
                in_ref_exons = index %in% S4Vectors::queryHits(overlap_ref_exons)
            ) %>%
            dplyr::mutate(
                diff_type = dplyr::case_when(
                    in_exons & in_ref_exons ~ "both",
                    in_exons & !in_ref_exons ~ "not_in_ref",
                    !in_exons & in_ref_exons ~ "in_ref"
                )
            )

        # add back in group info
        diff_curr[[group_var]] <- groups[i]

        # keep only diffs and necessary cols
        diffs[[i]] <-
            diff_curr %>%
            dplyr::filter(diff_type != "both") %>%
            dplyr::select(-in_exons, -in_ref_exons, -index)
    }

    # stack the per-group differences into a single data.frame
    diffs <- dplyr::bind_rows(diffs)

    return(diffs)
}


================================================
FILE: R/to_intron.R
================================================
#' Convert exon co-ordinates to introns
#'
#' `to_intron()` takes a set of `exons` and returns the introns that lie
#' between them.
#'
#' Note that, for visualization purposes, the introns returned by
#' `to_intron()` share their boundaries with the flanking exons: the intron
#' start is the end of the upstream exon and the intron end is the start of the
#' downstream exon, rather than (exon end + 1)/(exon start - 1).
#'
#' @inheritParams to_diff
#'
#' @return `data.frame()` contains the intron co-ordinates.
#'
#' @export
#' @examples
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' # to_intron() is a helper function included in ggtranscript
#' # which is useful for converting exon co-ordinates to introns
#' sod1_introns <- sod1_exons %>% to_intron(group_var = "transcript_name")
#' sod1_introns %>% head()
#'
#' # this can be particularly useful when combined with
#' # geom_range() and geom_intron()
#' # to visualize the core components of transcript annotation
#' sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name")
#'     )
to_intron <- function(exons, group_var = NULL) {
    .check_coord_object(exons)
    .check_group_var(exons, group_var)

    # TODO - switch this to using GenomicRanges::gaps()?

    # group first (when requested) so that dplyr::lag() below is evaluated
    # within each group rather than across the whole table
    if (!is.null(group_var)) {
        exons <- dplyr::group_by_at(exons, .vars = group_var)
    }

    introns <- exons %>%
        # exons must be ordered by co-ordinate for dplyr::lag() to pick up
        # the preceding exon
        dplyr::arrange(start, end) %>%
        # an intron runs from the end of the previous exon to the start of
        # the current one
        dplyr::mutate(
            intron_start = dplyr::lag(end),
            intron_end = start,
            type = "intron"
        ) %>%
        dplyr::select(-start, -end) %>%
        dplyr::ungroup() %>%
        # the first exon (of each group) has no predecessor, leaving an NA
        dplyr::filter(!(is.na(intron_start) | is.na(intron_end))) %>%
        # filter out introns with a width of 1, this should only happen when
        # utrs are included and are directly adjacent to end of cds
        dplyr::filter(abs(intron_end - intron_start) != 1) %>%
        dplyr::rename(start = intron_start, end = intron_end)

    introns
}


================================================
FILE: R/utils.R
================================================
#' Validate an object holding genomic co-ordinates.
#'
#' Stops with an informative error unless `x` is a `data.frame()` containing
#' the columns `start` and `end`. The columns `seqnames` and `strand` are only
#' required when the matching flag is set.
#'
#' @param x object to validate.
#' @param check_seqnames `logical()` whether to also require a `seqnames`
#'   column.
#' @param check_strand `logical()` whether to also require a `strand` column.
#'
#' @return `NULL`, invisibly. Called for its side effect of raising an error.
#'
#' @keywords internal
#' @noRd
.check_coord_object <- function(x,
                                check_seqnames = FALSE,
                                check_strand = FALSE) {
    if (!is.data.frame(x)) {
        stop(
            "object must be a data.frame. ",
            "GRanges objects are currently not supported and must be converted ",
            "using e.g. as.data.frame()"
        )
    }

    available <- colnames(x)

    # start and end are always mandatory
    if (any(!(c("start", "end") %in% available))) {
        stop("object must have the columns 'start' and 'end'")
    }

    # the remaining columns are only checked on request
    if (check_seqnames) {
        if (!is.element("seqnames", available)) {
            stop("object must have the column 'seqnames'")
        }
    }

    if (check_strand) {
        if (!is.element("strand", available)) {
            stop("object must have the column 'strand'")
        }
    }

    invisible(NULL)
}

#' Validate that `group_var` names columns of `x`.
#'
#' Does nothing when `group_var` is `NULL`. Otherwise every element of
#' `group_var` must be a column name of `x`, else an error is raised.
#'
#' @param x object whose column names are checked.
#' @param group_var `NULL` or `character()` of column name(s).
#'
#' @return `NULL`, invisibly. Called for its side effect of raising an error.
#'
#' @keywords internal
#' @noRd
.check_group_var <- function(x, group_var) {
    if (is.null(group_var)) {
        return(invisible(NULL))
    }

    if (!all(group_var %in% colnames(x))) {
        stop(
            # stop() concatenates its arguments without a separator, so
            # several names have to be joined explicitly to stay readable;
            # a single name yields the same message as before
            "group_var ('", paste(group_var, collapse = "', '"), "') ",
            "must be a column in object"
        )
    }

    invisible(NULL)
}


================================================
FILE: README.Rmd
================================================
---
output: github_document
---

```{r, include = FALSE}
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.path = "man/figures/README-",
    out.width = "100%",
    dpi = 300
)
```

# ggtranscript <img src="man/figures/ggtranscript_logo_cropped.svg" align="right" height="139" />

<!-- badges: start -->
[![GitHub issues](https://img.shields.io/github/issues/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/issues)
[![GitHub pulls](https://img.shields.io/github/issues-pr/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/pulls)
[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
[![R-CMD-check-bioc](https://github.com/dzhang32/ggtranscript/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/dzhang32/ggtranscript/actions)
[![Codecov test coverage](https://codecov.io/gh/dzhang32/ggtranscript/branch/main/graph/badge.svg)](https://app.codecov.io/gh/dzhang32/ggtranscript?branch=main)
<!-- badges: end -->

`ggtranscript` is a `ggplot2` extension that makes it easy to visualize transcript structure and annotation.

## Installation

```{r "install_dev", eval = FALSE}
# you can install the development version of ggtranscript from GitHub:
# install.packages("devtools")
devtools::install_github("dzhang32/ggtranscript")
```

## Usage

`ggtranscript` introduces 5 new geoms (`geom_range()`, `geom_half_range()`, `geom_intron()`, `geom_junction()` and `geom_junction_label_repel()`) and several helper functions designed to facilitate the visualization of transcript structure and annotation. The following guide takes you on a quick tour of using these geoms, for a more detailed overview see the [Getting Started tutorial](https://dzhang32.github.io/ggtranscript/articles/ggtranscript.html).

`geom_range()` and `geom_intron()` enable the plotting of exons and introns, the core components of transcript annotation. `ggtranscript` also provides `to_intron()`, which converts exon co-ordinates to the corresponding introns. Together, `ggtranscript` enables users to plot transcript structures with only exons as the required input and just a few lines of code.  

```{r geom-range-intron}
library(magrittr)
library(dplyr)
library(ggplot2)
library(ggtranscript)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation %>% head()

# extract exons
sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")

sod1_exons %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range(
        aes(fill = transcript_biotype)
    ) +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name"),
        aes(strand = strand)
    )
```

`ggtranscript` provides the helper function `shorten_gaps()`, which reduces the size of the gaps. `shorten_gaps()` then rescales the exon and intron co-ordinates to preserve the original exon alignment. This allows you to hone in on the differences in the exonic structure, which can be particularly useful if the transcript has relatively long introns.

```{r shorten-gaps}

sod1_rescaled <- shorten_gaps(
  sod1_exons, 
  to_intron(sod1_exons, "transcript_name"), 
  group_var = "transcript_name"
  )

sod1_rescaled %>%
    dplyr::filter(type == "exon") %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range(
      aes(fill = transcript_biotype)
    ) +
    geom_intron(
        data = sod1_rescaled %>% dplyr::filter(type == "intron"), 
        arrow.min.intron.length = 200
    )

```


`geom_range()` can be used for any range-based genomic annotation. For example, when plotting protein-coding transcripts, users may find it helpful to visually distinguish the coding segments from UTRs. 

```{r geom-range-intron-w-cds}
# filter for only exons from protein coding transcripts
sod1_exons_prot_cod <- sod1_exons %>%
    dplyr::filter(transcript_biotype == "protein_coding")

# obtain cds
sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS")

sod1_exons_prot_cod %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range(
        fill = "white",
        height = 0.25
    ) +
    geom_range(
        data = sod1_cds
    ) +
    geom_intron(
        data = to_intron(sod1_exons_prot_cod, "transcript_name"),
        aes(strand = strand),
        arrow.min.intron.length = 500,
    )
```

`geom_half_range()` takes advantage of the vertical symmetry of transcript annotation by plotting only half of a range on the top or bottom of a transcript structure. One use case of `geom_half_range()` is to visualize the differences between transcript structure more clearly. 

```{r geom-half-range, fig.height = 3}

# extract exons and cds for the two transcripts to be compared
sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202")
sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202")

sod1_201_202_plot <- sod1_201_exons %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = "SOD1-201/202"
    )) +
    geom_half_range(
        fill = "white",
        height = 0.125
    ) +
    geom_half_range(
        data = sod1_201_cds
    ) +
    geom_intron(
        data = to_intron(sod1_201_exons, "transcript_name")
    ) +
    geom_half_range(
        data = sod1_202_exons,
        range.orientation = "top",
        fill = "white",
        height = 0.125
    ) +
    geom_half_range(
        data = sod1_202_cds,
        range.orientation = "top",
        fill = "purple"
    ) +
    geom_intron(
        data = to_intron(sod1_202_exons, "transcript_name")
    )

sod1_201_202_plot

```

As a `ggplot2` extension, `ggtranscript` inherits the familiarity and functionality of `ggplot2`. For instance, by leveraging `coord_cartesian()` users can zoom in on regions of interest.

```{r geom-half-range-zoomed, fig.height = 3}

sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))

```

`geom_junction()` enables the plotting of junction curves, which can be overlaid across transcript structures. `geom_junction_label_repel()` adds a label to junction curves, which can often be useful to mark junctions with a metric of their usage such as read counts.

```{r geom-junction, fig.height = 3}

# ggtranscript includes a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions

# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions %>%
    dplyr::mutate(transcript_name = "SOD1-201")

sod1_201_exons %>%
  ggplot(aes(
    xstart = start,
    xend = end,
    y = transcript_name
  )) +
  geom_range(
    fill = "white", 
    height = 0.25
  ) +
  geom_range(
    data = sod1_201_cds
  ) + 
  geom_intron(
    data = to_intron(sod1_201_exons, "transcript_name")
  ) + 
  geom_junction(
    data = sod1_junctions,
    junction.y.max = 0.5
  ) +
  geom_junction_label_repel(
    data = sod1_junctions,
    aes(label = round(mean_count, 2)),
    junction.y.max = 0.5
  )

```

Alternatively, users may prefer to map junction read counts to the thickness of the junction curves. As a `ggplot2` extension, this can be done intuitively by modifying the size `aes()` of `geom_junction()`. In addition, by modifying `ggplot2` scales and themes, users can easily create informative, publication-ready plots.

```{r geom-junction-pub, fig.height = 3}

sod1_201_exons %>%
  ggplot(aes(
    xstart = start,
    xend = end,
    y = transcript_name
  )) +
  geom_range(
    fill = "white", 
    height = 0.25
  ) +
  geom_range(
    data = sod1_201_cds
  ) + 
  geom_intron(
    data = to_intron(sod1_201_exons, "transcript_name")
  ) + 
  geom_junction(
    data = sod1_junctions,
    aes(size = mean_count),
    junction.y.max = 0.5, 
    ncp = 30, 
    colour = "purple"
  ) + 
  scale_size_continuous(range = c(0.1, 1), guide = "none") + 
  xlab("Genomic position (chr21)") + 
  ylab("Transcript name") + 
  theme_bw()

```

## Citation

```{r citing-ggtranscript}

citation("ggtranscript")

```

## Credits

* `ggtranscript` was developed using `biocthis`.


================================================
FILE: README.md
================================================

# ggtranscript <img src="man/figures/ggtranscript_logo_cropped.svg" align="right" height="139" />

<!-- badges: start -->

[![GitHub
issues](https://img.shields.io/github/issues/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/issues)
[![GitHub
pulls](https://img.shields.io/github/issues-pr/dzhang32/ggtranscript)](https://github.com/dzhang32/ggtranscript/pulls)
[![Lifecycle:
experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
[![R-CMD-check-bioc](https://github.com/dzhang32/ggtranscript/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/dzhang32/ggtranscript/actions)
[![Codecov test
coverage](https://codecov.io/gh/dzhang32/ggtranscript/branch/main/graph/badge.svg)](https://app.codecov.io/gh/dzhang32/ggtranscript?branch=main)
<!-- badges: end -->

`ggtranscript` is a `ggplot2` extension that makes it easy to
visualize transcript structure and annotation.

## Installation

``` r
# you can install the development version of ggtranscript from GitHub:
# install.packages("devtools")
devtools::install_github("dzhang32/ggtranscript")
```

## Usage

`ggtranscript` introduces 5 new geoms (`geom_range()`,
`geom_half_range()`, `geom_intron()`, `geom_junction()` and
`geom_junction_label_repel()`) and several helper functions designed to
facilitate the visualization of transcript structure and annotation. The
following guide takes you on a quick tour of using these geoms, for a
more detailed overview see the [Getting Started
tutorial](https://dzhang32.github.io/ggtranscript/articles/ggtranscript.html).

`geom_range()` and `geom_intron()` enable the plotting of exons and
introns, the core components of transcript annotation. `ggtranscript`
also provides `to_intron()`, which converts exon co-ordinates to the
corresponding introns. Together, `ggtranscript` enables users to plot
transcript structures with only exons as the required input and just a
few lines of code.

``` r
library(magrittr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)
library(ggtranscript)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation %>% head()
#> # A tibble: 6 × 8
#>   seqnames    start      end strand type        gene_name transcript_name
#>   <fct>       <int>    <int> <fct>  <fct>       <chr>     <chr>          
#> 1 21       31659666 31668931 +      gene        SOD1      <NA>           
#> 2 21       31659666 31668931 +      transcript  SOD1      SOD1-202       
#> 3 21       31659666 31659784 +      exon        SOD1      SOD1-202       
#> 4 21       31659770 31659784 +      CDS         SOD1      SOD1-202       
#> 5 21       31659770 31659772 +      start_codon SOD1      SOD1-202       
#> 6 21       31663790 31663886 +      exon        SOD1      SOD1-202       
#> # ℹ 1 more variable: transcript_biotype <chr>

# extract exons
sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")

sod1_exons %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range(
        aes(fill = transcript_biotype)
    ) +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name"),
        aes(strand = strand)
    )
```

<img src="man/figures/README-geom-range-intron-1.png" width="100%" />

`ggtranscript` provides the helper function `shorten_gaps()`, which
reduces the size of the gaps. `shorten_gaps()` then rescales the exon
and intron co-ordinates to preserve the original exon alignment. This
allows you to hone in on the differences in the exonic structure, which can
be particularly useful if the transcript has relatively long introns.

``` r

sod1_rescaled <- shorten_gaps(
  sod1_exons, 
  to_intron(sod1_exons, "transcript_name"), 
  group_var = "transcript_name"
  )

sod1_rescaled %>%
    dplyr::filter(type == "exon") %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range(
      aes(fill = transcript_biotype)
    ) +
    geom_intron(
        data = sod1_rescaled %>% dplyr::filter(type == "intron"), 
        arrow.min.intron.length = 200
    )
```

<img src="man/figures/README-shorten-gaps-1.png" width="100%" />

`geom_range()` can be used for any range-based genomic annotation. For
example, when plotting protein-coding transcripts, users may find it
helpful to visually distinguish the coding segments from UTRs.

``` r
# filter for only exons from protein coding transcripts
sod1_exons_prot_cod <- sod1_exons %>%
    dplyr::filter(transcript_biotype == "protein_coding")

# obtain cds
sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS")

sod1_exons_prot_cod %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range(
        fill = "white",
        height = 0.25
    ) +
    geom_range(
        data = sod1_cds
    ) +
    geom_intron(
        data = to_intron(sod1_exons_prot_cod, "transcript_name"),
        aes(strand = strand),
        arrow.min.intron.length = 500,
    )
```

<img src="man/figures/README-geom-range-intron-w-cds-1.png" width="100%" />

`geom_half_range()` takes advantage of the vertical symmetry of
transcript annotation by plotting only half of a range on the top or
bottom of a transcript structure. One use case of `geom_half_range()` is
to visualize the differences between transcript structure more clearly.

``` r

# extract exons and cds for the two transcripts to be compared
sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201")
sod1_202_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-202")
sod1_202_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-202")

sod1_201_202_plot <- sod1_201_exons %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = "SOD1-201/202"
    )) +
    geom_half_range(
        fill = "white",
        height = 0.125
    ) +
    geom_half_range(
        data = sod1_201_cds
    ) +
    geom_intron(
        data = to_intron(sod1_201_exons, "transcript_name")
    ) +
    geom_half_range(
        data = sod1_202_exons,
        range.orientation = "top",
        fill = "white",
        height = 0.125
    ) +
    geom_half_range(
        data = sod1_202_cds,
        range.orientation = "top",
        fill = "purple"
    ) +
    geom_intron(
        data = to_intron(sod1_202_exons, "transcript_name")
    )

sod1_201_202_plot
```

<img src="man/figures/README-geom-half-range-1.png" width="100%" />

As a `ggplot2` extension, `ggtranscript` inherits the familiarity
and functionality of `ggplot2`. For instance, by leveraging
`coord_cartesian()` users can zoom in on regions of interest.

``` r

sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))
```

<img src="man/figures/README-geom-half-range-zoomed-1.png" width="100%" />

`geom_junction()` enables the plotting of junction curves, which can be
overlaid across transcript structures. `geom_junction_label_repel()`
adds a label to junction curves, which can often be useful to mark
junctions with a metric of their usage such as read counts.

``` r

# ggtranscript includes a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions
#> # A tibble: 5 × 5
#>   seqnames    start      end strand mean_count
#>   <fct>       <int>    <int> <fct>       <dbl>
#> 1 chr21    31659787 31666448 +           0.463
#> 2 chr21    31659842 31660554 +           0.831
#> 3 chr21    31659842 31663794 +           0.316
#> 4 chr21    31659842 31667257 +           4.35 
#> 5 chr21    31660351 31663789 +           0.324

# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions %>%
    dplyr::mutate(transcript_name = "SOD1-201")

sod1_201_exons %>%
  ggplot(aes(
    xstart = start,
    xend = end,
    y = transcript_name
  )) +
  geom_range(
    fill = "white", 
    height = 0.25
  ) +
  geom_range(
    data = sod1_201_cds
  ) + 
  geom_intron(
    data = to_intron(sod1_201_exons, "transcript_name")
  ) + 
  geom_junction(
    data = sod1_junctions,
    junction.y.max = 0.5
  ) +
  geom_junction_label_repel(
    data = sod1_junctions,
    aes(label = round(mean_count, 2)),
    junction.y.max = 0.5
  )
```

<img src="man/figures/README-geom-junction-1.png" width="100%" />

Alternatively, users may prefer to map junction read counts to the
thickness of the junction curves. As a `ggplot2` extension, this can be
done intuitively by modifying the size `aes()` of `geom_junction()`. In
addition, by modifying `ggplot2` scales and themes, users can easily
create informative, publication-ready plots.

``` r

sod1_201_exons %>%
  ggplot(aes(
    xstart = start,
    xend = end,
    y = transcript_name
  )) +
  geom_range(
    fill = "white", 
    height = 0.25
  ) +
  geom_range(
    data = sod1_201_cds
  ) + 
  geom_intron(
    data = to_intron(sod1_201_exons, "transcript_name")
  ) + 
  geom_junction(
    data = sod1_junctions,
    aes(size = mean_count),
    junction.y.max = 0.5, 
    ncp = 30, 
    colour = "purple"
  ) + 
  scale_size_continuous(range = c(0.1, 1), guide = "none") + 
  xlab("Genomic position (chr21)") + 
  ylab("Transcript name") + 
  theme_bw()
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
```

<img src="man/figures/README-geom-junction-pub-1.png" width="100%" />

## Citation

``` r

citation("ggtranscript")
#> To cite package 'ggtranscript' in publications use:
#> 
#>   Gustavsson EK, Zhang D, Reynolds RH, Garcia-Ruiz S, Ryten M (2022).
#>   "ggtranscript: an R package for the visualization and interpretation
#>   of transcript isoforms using ggplot2." _Bioinformatics_.
#>   doi:10.1093/bioinformatics/btac409
#>   <https://doi.org/10.1093/bioinformatics/btac409>,
#>   <https://academic.oup.com/bioinformatics/article/38/15/3844/6617821>.
#> 
#> A BibTeX entry for LaTeX users is
#> 
#>   @Article{,
#>     title = {ggtranscript: an R package for the visualization and interpretation of transcript isoforms using ggplot2},
#>     author = {Emil K Gustavsson and David Zhang and Regina H Reynolds and Sonia Garcia-Ruiz and Mina Ryten},
#>     year = {2022},
#>     journal = {Bioinformatics},
#>     doi = {https://doi.org/10.1093/bioinformatics/btac409},
#>     url = {https://academic.oup.com/bioinformatics/article/38/15/3844/6617821},
#>   }
```

## Credits

- `ggtranscript` was developed using `biocthis`.


================================================
FILE: _pkgdown.yml
================================================
# pkgdown site template: Bootstrap 5 with the "cosmo" Bootswatch theme
template:
  bootstrap: 5
  bootswatch: cosmo


================================================
FILE: codecov.yml
================================================
# Codecov configuration.
# NOTE(review): the key descriptions below follow Codecov's codecov.yml
# reference and should be confirmed against the current documentation.

# presumably disables Codecov's pull request comments
comment: false

coverage:
  status:
    # status check on the coverage of the whole project
    project:
      default:
        target: auto
        threshold: 1%
        # presumably report-only, i.e. the check never fails the build
        informational: true
    # status check on the coverage of the changed lines only
    patch:
      default:
        target: auto
        threshold: 1%
        # presumably report-only, i.e. the check never fails the build
        informational: true


================================================
FILE: data-raw/ggplot2_exts_thumbnail.R
================================================

# Load libraries ----------------------------------------------------------

library(tidyverse)
devtools::load_all(".")

# Main --------------------------------------------------------------------

# Builds a small example figure of the SOD1-201 transcript (exons, CDS,
# introns and junctions) and saves it as a PNG under man/figures.
# NOTE(review): judging by the file name this is presumably the thumbnail
# submitted to the ggplot2 extensions gallery - confirm.

# exons of SOD1-201 from the example annotation
sod1_201_exons <- sod1_annotation %>%
    dplyr::filter(
        type == "exon",
        transcript_name == "SOD1-201"
    )

# coding sequence (CDS) ranges of the same transcript
sod1_201_cds <- sod1_annotation %>%
    dplyr::filter(
        type == "CDS",
        transcript_name == "SOD1-201"
    )

# add transcript_name so the junctions can use the plot's y aesthetic
sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201")

ggplot2_exts_figure <- sod1_201_exons %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    # full exons as thin white ranges ...
    geom_range(
        fill = "white",
        height = 0.125
    ) +
    # ... with the CDS drawn on top as taller ranges
    geom_range(
        data = sod1_201_cds,
        height = 0.25
    ) +
    geom_intron(
        data = to_intron(sod1_201_exons, "transcript_name")
    ) +
    # junction curves, with mean_count mapped to the size aesthetic
    geom_junction(
        data = sod1_junctions,
        aes(size = mean_count),
        junction.y.max = 0.25,
        ncp = 30,
        colour = "purple"
    ) +
    # restrict the size range and hide the size legend
    scale_size_continuous(range = c(0.1, 1), guide = "none") +
    xlab("Genomic position (chr21)") +
    ylab("Transcript name") +
    theme_bw() +
    # keep the axis lines but drop the grid and the panel border
    theme(
        axis.line = element_line(colour = "black"),
        panel.grid = element_blank(),
        panel.border = element_blank()
    )

ggplot2_exts_figure

# Save data ---------------------------------------------------------------

# write the figure to man/figures/dzhang32-ggtranscript.png
ggsave(
    plot = ggplot2_exts_figure,
    filename = here::here("man", "figures", "dzhang32-ggtranscript.png"),
    height = 3,
    width = 3.5,
    dpi = 600
)


================================================
FILE: data-raw/ggtranscript_logo.R
================================================

# Load libraries ----------------------------------------------------------

library(tidyverse)
library(hexSticker)
library(showtext)
devtools::load_all(".")

# Main --------------------------------------------------------------------

# Draws the ggtranscript hex logo: a "T" and an "X" assembled from
# ggtranscript geoms, which is then wrapped into a hex sticker.

# hand-picked co-ordinates of the ranges making up the logo
# tx is the row (y position) of a range, letter marks the part of the logo
# it belongs to ("T", or the top/bottom half of the "X")
logo_exons <- tribble(
    ~start, ~end, ~tx, ~letter,
    #-------|-----|----|--------
    150, 200, "J", "T",
    500, 550, "J", "T",
    300, 310, "I", "T",
    350, 400, "I", "T",
    300, 350, "H", "T",
    390, 400, "H", "T",
    300, 310, "G", "T",
    350, 400, "G", "T",
    700, 800, "J", "X_top",
    1100, 1200, "J", "X_top",
    700, 800, "G", "X_bot",
    1100, 1200, "G", "X_bot"
) %>%
    # tx becomes a factor with 17 levels (A-Q) although only G-J hold data
    # NOTE(review): combined with scale_y_discrete(drop = FALSE) below this
    # presumably pads the plot with empty rows - confirm
    dplyr::mutate(
        tx = tx %>% factor(
            levels = LETTERS[1:17]
        )
    )

# two extra ranges on row J, directly adjacent to the outer ends of the
# row J exons of the "T" (drawn below as thinner white ranges)
logo_utr <- tribble(
    ~start, ~end, ~tx, ~letter,
    #-------|-----|----|--------
    100, 150, "J", "T",
    550, 600, "J", "T",
)

# introns between the "T" exons of each row
logo_introns <- logo_exons %>%
    dplyr::filter(letter == "T") %>%
    to_intron(group_var = "tx")

# the gaps between the "X" ranges of each row, used as junction co-ordinates
logo_junctions <- logo_exons %>%
    dplyr::filter(letter %in% c("X_top", "X_bot")) %>%
    to_intron(group_var = "tx")

# styling shared by all layers
size <- 0.3
colour <- "black"
fill <- ggpubr::get_palette("jco", 10)[10]

# create T
ggtranscript_logo <- logo_exons %>%
    dplyr::filter(letter == "T") %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = tx
    )) +
    geom_range(
        fill = fill,
        size = size,
        colour = colour
    ) +
    geom_range(
        data = logo_utr,
        fill = "white",
        height = 0.25,
        size = size,
        colour = colour
    ) +
    geom_intron(
        data = logo_introns,
        size = size,
        colour = colour,
        arrow.min.intron.length = 100
    )

# create X: half ranges on rows J and G joined by junction curves
# the bottom row's junction is drawn with the "top" orientation and the top
# row's junction with the "bottom" orientation
# NOTE(review): presumably so that the two curves cross - confirm
ggtranscript_logo <- ggtranscript_logo +
    geom_half_range(
        data = logo_exons %>% dplyr::filter(letter == "X_top"),
        range.orientation = "top",
        fill = fill,
        size = size,
        colour = colour,
    ) +
    geom_half_range(
        data = logo_exons %>% dplyr::filter(letter == "X_bot"),
        fill = fill,
        size = size,
        colour = colour,
    ) +
    geom_junction(
        data = logo_junctions %>% dplyr::filter(letter == "X_bot"),
        size = size,
        colour = colour,
        junction.orientation = "top",
        junction.y.max = 1.4,
        ncp = 50
    ) +
    geom_junction(
        data = logo_junctions %>% dplyr::filter(letter == "X_top"),
        size = size,
        colour = colour,
        junction.orientation = "bottom",
        junction.y.max = 1.4,
        ncp = 50
    )

# fix the axis limits, remove all axis annotation and lighten the grid
ggtranscript_logo <- ggtranscript_logo +
    scale_x_continuous(
        limits = c(-300, 1600),
        minor_breaks = seq(-300, 1500, 100)
    ) +
    scale_y_discrete(drop = FALSE) +
    theme_bw() +
    theme(
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        panel.grid.major = element_line(
            size = size,
            colour = ggpubr::get_palette("Greys", 10)[2]
        ),
        panel.grid.minor = element_line(
            size = size,
            colour = ggpubr::get_palette("Greys", 10)[2]
        )
    )

ggtranscript_logo

# Save data ---------------------------------------------------------------

# use font from https://fonts.google.com
font_add_google(name = "Raleway", family = "Raleway")
showtext_auto()

# wrap the plot into a hex sticker and write it to
# man/figures/ggtranscript_logo.png
ggtranscript_logo_hex <- hexSticker::sticker(
    # the plot (TX)
    subplot = ggtranscript_logo,
    s_x = 0.98,
    s_y = 1.2,
    s_width = 2.8,
    s_height = 3,
    # the package
    package = "ggtranscript",
    p_x = 1,
    p_y = 0.65,
    p_size = 35,
    p_family = "Raleway",
    p_fontface = "bold",
    p_color = ggpubr::get_palette("jco", 10)[6],
    # hex border
    h_color = ggpubr::get_palette("jco", 10)[6],
    h_fill = "white",
    h_size = 2,
    # url
    url = "https://github.com/dzhang32/ggtranscript",
    u_family = "Raleway",
    u_color = ggpubr::get_palette("jco", 10)[6],
    u_size = 6.5,
    # general
    filename = here::here("man", "figures", "ggtranscript_logo.png"),
    dpi = 600,
    white_around_sticker = TRUE
)

# here::here("man", "figures", "ggtranscript_logo.png") is then
# manually cropped to remove the white background in Inkscape
plot(ggtranscript_logo_hex)


================================================
FILE: data-raw/sod1_junctions.R
================================================

# Load libraries ----------------------------------------------------------

library(tidyverse)
library(snapcount)
library(SummarizedExperiment)

# Main --------------------------------------------------------------------

# Creates the example `sod1_junctions` data set from junctions queried via
# snapcount.

# obtain gtex junctions across SOD1
sod1_query <- snapcount::QueryBuilder(compilation = "gtex", regions = "SOD1")

# keeping only unannotated junctions
# from liver where SOD1 is highly expressed
# https://gtexportal.org/home/gene/SOD1
sod1_query <- set_row_filters(sod1_query, annotated == 0)
sod1_query <- set_column_filters(sod1_query, SMTS == "Liver")

# run the query
sod1_junctions <- snapcount::query_jx(sod1_query)

# obtain mean counts
# i.e. the row means of the "counts" assay, giving one value per junction
mean_counts <-
    sod1_junctions %>%
    SummarizedExperiment::assays() %>%
    .[["counts"]] %>%
    as.matrix() %>%
    rowMeans()

# from here on only the junction co-ordinates are kept, as a tibble
sod1_junctions <- sod1_junctions %>%
    SummarizedExperiment::rowRanges() %>%
    as.data.frame() %>%
    dplyr::as_tibble()

# minor QC and tidying of the junctions
# keep junctions with a mean count above 0.3 and only the columns needed
sod1_junctions <-
    sod1_junctions %>%
    dplyr::mutate(mean_count = mean_counts) %>%
    dplyr::filter(mean_count > 0.3) %>%
    dplyr::select(
        seqnames,
        start,
        end,
        strand,
        mean_count
    )

# Save data ---------------------------------------------------------------

# store the junctions as package data, replacing any existing copy
# NOTE(review): presumably written to data/sod1_junctions.rda - confirm
usethis::use_data(
    sod1_junctions,
    compress = "gzip",
    overwrite = TRUE
)


================================================
FILE: data-raw/sod1_pknox1_annotation.R
================================================

# Load libraries ----------------------------------------------------------

library(tidyverse)
library(rtracklayer)
library(R.utils)

# Main --------------------------------------------------------------------

# Extract the annotation of a single gene from an imported GTF.
#
# gtf:  ranges object returned by rtracklayer::import(), carrying a
#       `gene_name` metadata column
# gene: character(1), the gene name to keep (e.g. "SOD1")
#
# Returns a tibble with the range co-ordinates, feature type and
# gene/transcript identifiers for every feature of `gene`.
extract_gene_annotation <- function(gtf, gene) {
    # features lacking a gene_name (NA) are excluded explicitly, so that the
    # logical index never contains NA
    gtf[!is.na(gtf$gene_name) & gtf$gene_name == gene] %>%
        as.data.frame() %>%
        dplyr::as_tibble() %>%
        dplyr::select(
            seqnames,
            start,
            end,
            strand,
            type,
            gene_name,
            transcript_name,
            transcript_biotype
        )
}

gtf_path <- file.path(tempdir(), "Homo_sapiens.GRCh38.105.chr.gtf.gz")

# download ens 105 gtf (over https rather than plain http)
download.file(
    stringr::str_c(
        "https://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/",
        "Homo_sapiens.GRCh38.105.chr.gtf.gz"
    ),
    destfile = gtf_path
)

# unzip gtf
# overwrite = TRUE so that re-running this script within the same R session
# (and therefore the same tempdir()) does not stop because the decompressed
# file from the previous run already exists
R.utils::gunzip(gtf_path, overwrite = TRUE)

# gunzip() writes the decompressed file next to the archive, minus ".gz"
gtf_path <- gtf_path %>%
    stringr::str_remove("\\.gz$")

gtf <- rtracklayer::import(gtf_path)

# extract example gene transcripts
# convert to tibble()
sod1_annotation <- extract_gene_annotation(gtf, "SOD1")

pknox1_annotation <- extract_gene_annotation(gtf, "PKNOX1")

# Save data ---------------------------------------------------------------

usethis::use_data(
    sod1_annotation,
    compress = "gzip",
    overwrite = TRUE
)

usethis::use_data(
    pknox1_annotation,
    compress = "gzip",
    overwrite = TRUE
)


================================================
FILE: inst/CITATION
================================================
# Build the string "<GitHub URL of pkg> - R package version <version>".
#
# pkg: character(1), the package name.
#
# The version is read from a `meta` object when one is visible and not NULL;
# otherwise the package DESCRIPTION is looked up via packageDescription().
pkgVer <- function(pkg) {
    meta_unavailable <- !exists("meta") || is.null(meta)
    if (meta_unavailable) {
        meta <- packageDescription(pkg)
    }
    paste0(
        'https://github.com/dzhang32/', pkg,
        ' - R package version ', meta$Version
    )
}

# Citation entry for the ggtranscript publication (Bioinformatics, 2022).
c(
    bibentry(
        bibtype = "Article",
        title = "ggtranscript: an R package for the visualization and interpretation of transcript isoforms using ggplot2",
        # person objects are combined with c(); personList() is deprecated
        author = c(
            as.person("Emil K Gustavsson"),
            as.person("David Zhang"),
            as.person("Regina H Reynolds"),
            as.person("Sonia Garcia-Ruiz"),
            as.person("Mina Ryten")
        ),
        year = 2022,
        journal = "Bioinformatics",
        # bare DOI (no resolver prefix): the doi field is turned into a
        # https://doi.org/ link when the entry is formatted
        doi = "10.1093/bioinformatics/btac409",
        url = "https://academic.oup.com/bioinformatics/article/38/15/3844/6617821"
    )
)


================================================
FILE: man/add_exon_number.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/add_exon_number.R
\name{add_exon_number}
\alias{add_exon_number}
\title{Add exon number}
\usage{
add_exon_number(exons, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}

\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} equivalent to input \code{exons}, with the additional
column "exon_number".
}
\description{
\code{add_exon_number()} adds the exon number (the order the exons are transcribed
within each transcript) as a column in \code{exons}. This can be useful when
visualizing long, complex transcript structures, in order to keep track of
specific exons of interest.
}
\details{
To note, a "strand" column must be present within \code{exons}. The strand is used
to differentiate whether exon numbers should be calculated according to
ascending ("+") or descending ("-") genomic co-ordinates. For ambiguous
strands ("*"), \code{add_exon_number()} will assume the strand to be "+".
}
\examples{
library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()

# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()

# add the exon number for each transcript
sod1_exons <- sod1_exons \%>\% add_exon_number(group_var = "transcript_name")

base <- sod1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name"),
        strand = "+"
    )

# it can be useful to annotate exons with their exon number
# using ggplot2::geom_text()
base +
    geom_text(aes(
        x = (start + end) / 2, # plot label at midpoint of exon
        label = exon_number
    ),
    size = 3.5,
    nudge_y = 0.4
    )

# Or alternatively, using ggrepel::geom_label_repel()
# to separate labels from exons
base +
    ggrepel::geom_label_repel(ggplot2::aes(
        x = (start + end) / 2,
        label = exon_number
    ),
    size = 3.5,
    min.segment.length = 0
    )
}


================================================
FILE: man/add_utr.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/add_utr.R
\name{add_utr}
\alias{add_utr}
\title{Add untranslated regions (UTRs)}
\usage{
add_utr(exons, cds, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}

\item{cds}{\code{data.frame()} contains coding sequence ranges for the transcripts
in \code{exons}.}

\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} contains differentiated CDS and UTR ranges.
}
\description{
Given a set of \code{exons} (encompassing the CDS and UTRs) and \code{cds} regions,
\code{add_utr()} will calculate and add the corresponding UTR regions as ranges.
This can be useful when combined with \code{shorten_gaps()} to visualize
transcripts with long introns, whilst differentiating UTRs from CDS regions.
}
\details{
The inputted \code{cds} regions are expected to range from the
beginning of the start codon to the end of the stop codon. Sometimes, for
example in the case of Ensembl, reference annotation will omit the stop
codons from the CDS definition. In such cases, users should manually ensure
that the \code{cds} includes both the start and stop codons.
}
\examples{

library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation \%>\% head()

# extract exons
pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon")
pknox1_exons \%>\% head()

# extract cds
pknox1_cds <- pknox1_annotation \%>\% dplyr::filter(type == "CDS")
pknox1_cds \%>\% head()

# the CDS definition originating from the Ensembl reference annotation
# does not include the stop codon
# we must incorporate the stop codons into the CDS manually
# by adding 3 base pairs to the end of the CDS of each transcript
pknox1_cds_w_stop <- pknox1_cds \%>\%
    dplyr::group_by(transcript_name) \%>\%
    dplyr::mutate(
        end = ifelse(end == max(end), end + 3, end)
    ) \%>\%
    dplyr::ungroup()

# add_utr() adds ranges that represent the UTRs
pknox1_cds_utr <- add_utr(
    pknox1_exons,
    pknox1_cds_w_stop,
    group_var = "transcript_name"
)

pknox1_cds_utr \%>\% head()

# this can be useful when combined with shorten_gaps()
# to visualize transcripts with long introns whilst differentiating UTRs
pknox1_cds_utr_rescaled <-
    shorten_gaps(
        exons = pknox1_cds_utr,
        introns = to_intron(pknox1_cds_utr, "transcript_name"),
        group_var = "transcript_name"
    )

pknox1_cds_utr_rescaled \%>\%
    dplyr::filter(type == "CDS") \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_range(
        data = pknox1_cds_utr_rescaled \%>\% dplyr::filter(type == "UTR"),
        height = 0.25,
        fill = "white"
    ) +
    geom_intron(
        data = to_intron(
            pknox1_cds_utr_rescaled \%>\% dplyr::filter(type != "intron"),
            "transcript_name"
        ),
        arrow.min.intron.length = 110
    )
}


================================================
FILE: man/geom_intron.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_intron.R
\name{geom_intron}
\alias{geom_intron}
\title{Plot intron lines with strand arrows}
\usage{
geom_intron(
  mapping = NULL,
  data = NULL,
  stat = "identity",
  position = "identity",
  ...,
  arrow = grid::arrow(ends = "last", length = grid::unit(0.1, "inches")),
  arrow.fill = NULL,
  lineend = "butt",
  linejoin = "round",
  na.rm = FALSE,
  arrow.min.intron.length = 0,
  show.legend = NA,
  inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and
\code{inherit.aes = TRUE} (the default), it is combined with the default mapping
at the top level of the plot. You must supply \code{mapping} if there is no plot
mapping.}

\item{data}{The data to be displayed in this layer. There are three
options:

If \code{NULL}, the default, the data is inherited from the plot
data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}.

A \code{data.frame}, or other object, will override the plot
data. All objects will be fortified to produce a data frame. See
\code{\link[ggplot2:fortify]{fortify()}} for which variables will be created.

A \code{function} will be called with a single argument,
the plot data. The return value must be a \code{data.frame}, and
will be used as the layer data. A \code{function} can be created
from a \code{formula} (e.g. \code{~ head(.x, 10)}).}

\item{stat}{The statistical transformation to use on the data for this layer.
When using a \verb{geom_*()} function to construct a layer, the \code{stat}
argument can be used to override the default coupling between geoms and
stats. The \code{stat} argument accepts the following:
\itemize{
\item A \code{Stat} ggproto subclass, for example \code{StatCount}.
\item A string naming the stat. To give the stat as a string, strip the
function name of the \code{stat_} prefix. For example, to use \code{stat_count()},
give the stat as \code{"count"}.
\item For more information and other ways to specify the stat, see the
\link[ggplot2:layer_stats]{layer stat} documentation.
}}

\item{position}{A position adjustment to use on the data for this layer. This
can be used in various ways, including to prevent overplotting and
improving the display. The \code{position} argument accepts the following:
\itemize{
\item The result of calling a position function, such as \code{position_jitter()}.
This method allows for passing extra arguments to the position.
\item A string naming the position adjustment. To give the position as a
string, strip the function name of the \code{position_} prefix. For example,
to use \code{position_jitter()}, give the position as \code{"jitter"}.
\item For more information and other ways to specify the position, see the
\link[ggplot2:layer_positions]{layer position} documentation.
}}

\item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These
arguments broadly fall into one of 4 categories below. Notably, further
arguments to the \code{position} argument, or aesthetics that are required
can \emph{not} be passed through \code{...}. Unknown arguments that are not part
of the 4 categories below are ignored.
\itemize{
\item Static aesthetics that are not mapped to a scale, but are at a fixed
value and apply to the layer as a whole. For example, \code{colour = "red"}
or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics}
section that lists the available options. The 'required' aesthetics
cannot be passed on to the \code{params}. Please note that while passing
unmapped aesthetics as vectors is technically possible, the order and
required length is not guaranteed to be parallel to the input data.
\item When constructing a layer using
a \verb{stat_*()} function, the \code{...} argument can be used to pass on
parameters to the \code{geom} part of the layer. An example of this is
\code{stat_density(geom = "area", outline.type = "both")}. The geom's
documentation lists which parameters it can accept.
\item Inversely, when constructing a layer using a
\verb{geom_*()} function, the \code{...} argument can be used to pass on parameters
to the \code{stat} part of the layer. An example of this is
\code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation
lists which parameters it can accept.
\item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through
\code{...}. This can be one of the functions described as
\link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend.
}}

\item{arrow}{specification for arrow heads, as created by \code{\link[grid:arrow]{grid::arrow()}}.}

\item{arrow.fill}{fill colour to use for the arrow head (if closed). \code{NULL}
means use \code{colour} aesthetic.}

\item{lineend}{Line end style (round, butt, square).}

\item{linejoin}{Line join style (round, mitre, bevel).}

\item{na.rm}{If \code{FALSE}, the default, missing values are removed with
a warning. If \code{TRUE}, missing values are silently removed.}

\item{arrow.min.intron.length}{\code{integer()} the minimum required width of an
intron for a strand arrow to be drawn. This can be useful to remove strand
arrows on short introns that overlap adjacent exons.}

\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.
It can also be a named logical vector to finely select the aesthetics to
display.}

\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_intron()} draws horizontal lines with central arrows that are designed
to represent introns. In combination with \code{geom_range()}/\code{geom_half_range()},
these geoms form the core components for visualizing transcript structures.
}
\details{
\code{geom_intron()} requires the following \code{aes()}; \code{xstart}, \code{xend} and \code{y}
(e.g. transcript name). If users do not have intron co-ordinates, these can
be generated from the corresponding exons using \code{to_intron()}. The \code{strand}
option (one of "+" or "-") adjusts the arrow direction to match the direction
of transcription. The \code{arrow.min.intron.length} parameter can be useful to
remove strand arrows that overlap exons, which can be a problem if plotted
introns include those that are relatively short.
}
\examples{

library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation \%>\% head()

# extract exons
pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon")
pknox1_exons \%>\% head()

# to_intron() is a helper function included in ggtranscript
# which is useful for converting exon co-ordinates to introns
pknox1_introns <- pknox1_exons \%>\% to_intron(group_var = "transcript_name")
pknox1_introns \%>\% head()

base <- pknox1_introns \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    ))

# by default, geom_intron() assumes introns originate from the "+" strand
base + geom_intron()

# however this can be modified using the strand option
base + geom_intron(strand = "-")

# strand can also be set as an aes()
base + geom_intron(aes(strand = strand))

# as a ggplot2 extension, ggtranscript geoms inherit the
# functionality from the parameters and aesthetics in ggplot2
base + geom_intron(
    aes(colour = transcript_name),
    linewidth = 1
)

# together, geom_range() and geom_intron() are designed to visualize
# the core components of transcript annotation
pknox1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = pknox1_introns
    )

# for short introns, sometimes strand arrows will overlap exons
# to avoid this, users can set the arrow.min.intron.length parameter
pknox1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = pknox1_introns,
        arrow.min.intron.length = 3500
    )
}


================================================
FILE: man/geom_junction.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_junction.R
\name{geom_junction}
\alias{geom_junction}
\title{Plot junction curves}
\usage{
geom_junction(
  mapping = NULL,
  data = NULL,
  stat = "identity",
  position = "identity",
  junction.orientation = "alternating",
  junction.y.max = 1,
  angle = 90,
  ncp = 15,
  na.rm = FALSE,
  orientation = NA,
  show.legend = NA,
  inherit.aes = TRUE,
  ...
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and
\code{inherit.aes = TRUE} (the default), it is combined with the default mapping
at the top level of the plot. You must supply \code{mapping} if there is no plot
mapping.}

\item{data}{The data to be displayed in this layer. There are three
options:

If \code{NULL}, the default, the data is inherited from the plot
data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}.

A \code{data.frame}, or other object, will override the plot
data. All objects will be fortified to produce a data frame. See
\code{\link[ggplot2:fortify]{fortify()}} for which variables will be created.

A \code{function} will be called with a single argument,
the plot data. The return value must be a \code{data.frame}, and
will be used as the layer data. A \code{function} can be created
from a \code{formula} (e.g. \code{~ head(.x, 10)}).}

\item{stat}{The statistical transformation to use on the data for this layer.
When using a \verb{geom_*()} function to construct a layer, the \code{stat}
argument can be used to override the default coupling between geoms and
stats. The \code{stat} argument accepts the following:
\itemize{
\item A \code{Stat} ggproto subclass, for example \code{StatCount}.
\item A string naming the stat. To give the stat as a string, strip the
function name of the \code{stat_} prefix. For example, to use \code{stat_count()},
give the stat as \code{"count"}.
\item For more information and other ways to specify the stat, see the
\link[ggplot2:layer_stats]{layer stat} documentation.
}}

\item{position}{A position adjustment to use on the data for this layer. This
can be used in various ways, including to prevent overplotting and
improving the display. The \code{position} argument accepts the following:
\itemize{
\item The result of calling a position function, such as \code{position_jitter()}.
This method allows for passing extra arguments to the position.
\item A string naming the position adjustment. To give the position as a
string, strip the function name of the \code{position_} prefix. For example,
to use \code{position_jitter()}, give the position as \code{"jitter"}.
\item For more information and other ways to specify the position, see the
\link[ggplot2:layer_positions]{layer position} documentation.
}}

\item{junction.orientation}{\code{character()} one of "alternating", "top" or
"bottom", specifying where the junctions will be plotted with respect to
each transcript (\code{y}).}

\item{junction.y.max}{\code{double()} the max y-value of each junction curve. It
can be useful to adjust this parameter when junction curves overlap with
one another/other transcripts or extend beyond the plot margins.}

\item{angle}{A numeric value between 0 and 180,
    giving an amount to skew the control
    points of the curve.  Values less than 90 skew the curve towards
    the start point and values greater than 90 skew the curve
    towards the end point.}

\item{ncp}{The number of control points used to draw the curve.
    More control points creates a smoother curve.}

\item{na.rm}{If \code{FALSE}, the default, missing values are removed with
a warning. If \code{TRUE}, missing values are silently removed.}

\item{orientation}{The orientation of the layer. The default (\code{NA})
automatically determines the orientation from the aesthetic mapping. In the
rare event that this fails it can be given explicitly by setting \code{orientation}
to either \code{"x"} or \code{"y"}. See the \emph{Orientation} section for more detail.}

\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.
It can also be a named logical vector to finely select the aesthetics to
display.}

\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.}

\item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These
arguments broadly fall into one of 4 categories below. Notably, further
arguments to the \code{position} argument, or aesthetics that are required
can \emph{not} be passed through \code{...}. Unknown arguments that are not part
of the 4 categories below are ignored.
\itemize{
\item Static aesthetics that are not mapped to a scale, but are at a fixed
value and apply to the layer as a whole. For example, \code{colour = "red"}
or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics}
section that lists the available options. The 'required' aesthetics
cannot be passed on to the \code{params}. Please note that while passing
unmapped aesthetics as vectors is technically possible, the order and
required length is not guaranteed to be parallel to the input data.
\item When constructing a layer using
a \verb{stat_*()} function, the \code{...} argument can be used to pass on
parameters to the \code{geom} part of the layer. An example of this is
\code{stat_density(geom = "area", outline.type = "both")}. The geom's
documentation lists which parameters it can accept.
\item Inversely, when constructing a layer using a
\verb{geom_*()} function, the \code{...} argument can be used to pass on parameters
to the \code{stat} part of the layer. An example of this is
\code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation
lists which parameters it can accept.
\item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through
\code{...}. This can be one of the functions described as
\link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend.
}}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_junction()} draws curves that are designed to represent junction reads
from RNA-sequencing data. It can be useful to overlay junction data on
transcript annotation (plotted using \code{geom_range()}/\code{geom_half_range()} and
\code{geom_intron()}) to understand which splicing events or transcripts have
support from RNA-sequencing data.
}
\details{
\code{geom_junction()} requires the following \code{aes()}; \code{xstart}, \code{xend} and \code{y}
(e.g. transcript name). \code{geom_junction()} curves can be modified using
\code{junction.y.max}, which can be useful when junctions overlap one
another/other transcripts or extend beyond the plot margins. By default,
junction curves will alternate between being plotted on the top and bottom of
each transcript (\code{y}), however this can be modified via
\code{junction.orientation}.
}
\examples{

library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()

# as well as a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions

# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(
    type == "exon",
    transcript_name == "SOD1-201"
)
sod1_exons \%>\% head()

# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions \%>\%
    dplyr::mutate(transcript_name = "SOD1-201")

# junctions can be plotted as curves using geom_junction()
base <- sod1_junctions \%>\%
    ggplot2::ggplot(ggplot2::aes(
        xstart = start,
        xend = end,
        y = transcript_name
    ))

# sometimes, depending on the number and widths of transcripts and junctions
# junctions will overlap one another or extend beyond the plot margin
base + geom_junction()

# in such cases, junction.y.max can be adjusted to modify the max y of curves
base + geom_junction(junction.y.max = 0.5)

# ncp can be used to improve the smoothness of curves
base + geom_junction(junction.y.max = 0.5, ncp = 30)

# junction.orientation controls where the junctions are plotted
# with respect to each transcript
# either alternating (default), or on the top or bottom
base + geom_junction(junction.orientation = "top", junction.y.max = 0.5)
base + geom_junction(junction.orientation = "bottom", junction.y.max = 0.5)

# it can be useful to overlay junction curves onto existing annotation
# plotted using geom_range() and geom_intron()
base <- sod1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name")
    )

base + geom_junction(
    data = sod1_junctions,
    junction.y.max = 0.5
)

# as a ggplot2 extension, ggtranscript geoms inherit the
# functionality from the parameters and aesthetics in ggplot2
# this can be useful when mapping junction thickness to their counts
base + geom_junction(
    data = sod1_junctions,
    aes(linewidth = mean_count),
    junction.y.max = 0.5,
    colour = "purple"
) +
    scale_linewidth(range = c(0.1, 1))

# it can be useful to combine geom_junction() with geom_half_range()
sod1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_half_range() +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name")
    ) +
    geom_junction(
        data = sod1_junctions,
        aes(linewidth = mean_count),
        junction.y.max = 0.5,
        junction.orientation = "top",
        colour = "purple"
    ) +
    scale_linewidth(range = c(0.1, 1))
}


================================================
FILE: man/geom_junction_label_repel.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_junction_label_repel.R
\name{geom_junction_label_repel}
\alias{geom_junction_label_repel}
\title{Label junction curves}
\usage{
geom_junction_label_repel(
  mapping = NULL,
  data = NULL,
  stat = "identity",
  position = "identity",
  parse = FALSE,
  ...,
  junction.orientation = "alternating",
  junction.y.max = 1,
  angle = 90,
  ncp = 15,
  box.padding = 0.25,
  label.padding = 0.25,
  point.padding = 1e-06,
  label.r = 0.15,
  label.size = 0.25,
  min.segment.length = 0,
  arrow = NULL,
  force = 1,
  force_pull = 1,
  max.time = 0.5,
  max.iter = 10000,
  max.overlaps = getOption("ggrepel.max.overlaps", default = 10),
  nudge_x = 0,
  nudge_y = 0,
  xlim = c(NA, NA),
  ylim = c(NA, NA),
  na.rm = FALSE,
  show.legend = NA,
  direction = c("both", "y", "x"),
  seed = NA,
  verbose = FALSE,
  inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2]{aes}} or
\code{\link[ggplot2]{aes_}}. If specified and \code{inherit.aes = TRUE} (the
default), is combined with the default mapping at the top level of the
plot. You only need to supply \code{mapping} if there isn't a mapping
defined for the plot.}

\item{data}{A data frame. If specified, overrides the default data frame
defined at the top level of the plot.}

\item{stat}{The statistical transformation to use on the data for this
layer, as a string.}

\item{position}{Position adjustment, either as a string, or the result of
a call to a position adjustment function.}

\item{parse}{If TRUE, the labels will be parsed into expressions and
displayed as described in ?plotmath}

\item{...}{other arguments passed on to \code{\link[ggplot2]{layer}}. There are
  three types of arguments you can use here:

  \itemize{
    \item Aesthetics: to set an aesthetic to a fixed value, like
       \code{colour = "red"} or \code{size = 3}.
    \item Other arguments to the layer, for example you override the
      default \code{stat} associated with the layer.
    \item Other arguments passed on to the stat.
  }}

\item{junction.orientation}{\code{character()} one of "alternating", "top" or
"bottom", specifying where the junctions will be plotted with respect to
each transcript (\code{y}).}

\item{junction.y.max}{\code{double()} the max y-value of each junction curve. It
can be useful to adjust this parameter when junction curves overlap with
one another/other transcripts or extend beyond the plot margins.}

\item{angle}{A numeric value between 0 and 180,
    giving an amount to skew the control
    points of the curve.  Values less than 90 skew the curve towards
    the start point and values greater than 90 skew the curve
    towards the end point.}

\item{ncp}{The number of control points used to draw the curve.
    More control points creates a smoother curve.}

\item{box.padding}{Amount of padding around bounding box, as unit or number.
Defaults to 0.25. (Default unit is lines, but other units can be specified
by passing \code{unit(x, "units")}).}

\item{label.padding}{Amount of padding around label, as unit or number.
Defaults to 0.25. (Default unit is lines, but other units can be specified
by passing \code{unit(x, "units")}).}

\item{point.padding}{Amount of padding around labeled point, as unit or
number. Defaults to 0. (Default unit is lines, but other units can be
specified by passing \code{unit(x, "units")}).}

\item{label.r}{Radius of rounded corners, as unit or number. Defaults
to 0.15. (Default unit is lines, but other units can be specified by
passing \code{unit(x, "units")}).}

\item{label.size}{Size of label border, in mm.}

\item{min.segment.length}{Skip drawing segments shorter than this, as unit or
number. Defaults to 0.5. (Default unit is lines, but other units can be
specified by passing \code{unit(x, "units")}).}

\item{arrow}{specification for arrow heads, as created by \code{\link[grid]{arrow}}}

\item{force}{Force of repulsion between overlapping text labels. Defaults
to 1.}

\item{force_pull}{Force of attraction between a text label and its
corresponding data point. Defaults to 1.}

\item{max.time}{Maximum number of seconds to try to resolve overlaps.
Defaults to 0.5.}

\item{max.iter}{Maximum number of iterations to try to resolve overlaps.
Defaults to 10000.}

\item{max.overlaps}{Exclude text labels when they overlap too many other
things. For each text label, we count how many other text labels or other
data points it overlaps, and exclude the text label if it has too many overlaps.
Defaults to 10.}

\item{nudge_x, nudge_y}{Horizontal and vertical adjustments to nudge the
starting position of each text label. The units for \code{nudge_x} and
\code{nudge_y} are the same as for the data units on the x-axis and y-axis.}

\item{xlim, ylim}{Limits for the x and y axes. Text labels will be constrained
to these limits. By default, text labels are constrained to the entire plot
area.}

\item{na.rm}{If \code{FALSE} (the default), removes missing values with
a warning.  If \code{TRUE} silently removes missing values.}

\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.}

\item{direction}{"both", "x", or "y" -- direction in which to adjust position of labels}

\item{seed}{Random seed passed to \code{\link[base]{set.seed}}. Defaults to
\code{NA}, which means that \code{set.seed} will not be called.}

\item{verbose}{If \code{TRUE}, some diagnostics of the repel algorithm are printed}

\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2]{borders}}.}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_junction_label_repel()} labels junction curves at their midpoint using
\code{ggrepel::geom_label_repel()}. This can be useful to label and compare
junctions (plotted using \code{geom_junction()}) with metrics of their usage (e.g.
read counts or percent-spliced-in).
}
\details{
\code{geom_junction_label_repel()} requires the following \code{aes()}; \code{xstart},
\code{xend}, \code{y} (e.g. transcript name) and \code{label}. Under the hood,
\code{geom_junction_label_repel()} generates the same junction curves as
\code{geom_junction()} to obtain curve midpoints for labeling. Therefore, it is
important that users use the same input data and parameters that alter
junction curves (namely \code{junction.orientation}, \code{junction.y.max}, \code{angle},
\code{ncp}) for \code{geom_junction_label_repel()} that they have used for
\code{geom_junction()}.
}
\examples{
library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()

# as well as a set of example (unannotated) junctions
# originating from GTEx and downloaded via the Bioconductor package snapcount
sod1_junctions

# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(
    type == "exon",
    transcript_name == "SOD1-201"
)
sod1_exons \%>\% head()

# add transcript_name to junctions for plotting
sod1_junctions <- sod1_junctions \%>\%
    dplyr::mutate(transcript_name = "SOD1-201")

# geom_junction_label_repel() can be used to label junctions
base <- sod1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name")
    )

# this can be useful to label junctions with their counts
base +
    geom_junction(
        data = sod1_junctions,
        junction.y.max = 0.5
    ) +
    geom_junction_label_repel(
        data = sod1_junctions,
        aes(label = round(mean_count, 2)),
        junction.y.max = 0.5
    )
}


================================================
FILE: man/geom_range.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/geom_range.R, R/geom_half_range.R
\name{geom_range}
\alias{geom_range}
\alias{geom_half_range}
\title{Plot genomic ranges}
\usage{
geom_range(
  mapping = NULL,
  data = NULL,
  stat = "identity",
  position = "identity",
  ...,
  vjust = NULL,
  linejoin = "mitre",
  na.rm = FALSE,
  show.legend = NA,
  inherit.aes = TRUE
)

geom_half_range(
  mapping = NULL,
  data = NULL,
  stat = "identity",
  position = "identity",
  ...,
  range.orientation = "bottom",
  linejoin = "mitre",
  na.rm = FALSE,
  show.legend = NA,
  inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}}. If specified and
\code{inherit.aes = TRUE} (the default), it is combined with the default mapping
at the top level of the plot. You must supply \code{mapping} if there is no plot
mapping.}

\item{data}{The data to be displayed in this layer. There are three
options:

If \code{NULL}, the default, the data is inherited from the plot
data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}.

A \code{data.frame}, or other object, will override the plot
data. All objects will be fortified to produce a data frame. See
\code{\link[ggplot2:fortify]{fortify()}} for which variables will be created.

A \code{function} will be called with a single argument,
the plot data. The return value must be a \code{data.frame}, and
will be used as the layer data. A \code{function} can be created
from a \code{formula} (e.g. \code{~ head(.x, 10)}).}

\item{stat}{The statistical transformation to use on the data for this layer.
When using a \verb{geom_*()} function to construct a layer, the \code{stat}
argument can be used to override the default coupling between geoms and
stats. The \code{stat} argument accepts the following:
\itemize{
\item A \code{Stat} ggproto subclass, for example \code{StatCount}.
\item A string naming the stat. To give the stat as a string, strip the
function name of the \code{stat_} prefix. For example, to use \code{stat_count()},
give the stat as \code{"count"}.
\item For more information and other ways to specify the stat, see the
\link[ggplot2:layer_stats]{layer stat} documentation.
}}

\item{position}{A position adjustment to use on the data for this layer. This
can be used in various ways, including to prevent overplotting and
improving the display. The \code{position} argument accepts the following:
\itemize{
\item The result of calling a position function, such as \code{position_jitter()}.
This method allows for passing extra arguments to the position.
\item A string naming the position adjustment. To give the position as a
string, strip the function name of the \code{position_} prefix. For example,
to use \code{position_jitter()}, give the position as \code{"jitter"}.
\item For more information and other ways to specify the position, see the
\link[ggplot2:layer_positions]{layer position} documentation.
}}

\item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}'s \code{params} argument. These
arguments broadly fall into one of 4 categories below. Notably, further
arguments to the \code{position} argument, or aesthetics that are required
can \emph{not} be passed through \code{...}. Unknown arguments that are not part
of the 4 categories below are ignored.
\itemize{
\item Static aesthetics that are not mapped to a scale, but are at a fixed
value and apply to the layer as a whole. For example, \code{colour = "red"}
or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics}
section that lists the available options. The 'required' aesthetics
cannot be passed on to the \code{params}. Please note that while passing
unmapped aesthetics as vectors is technically possible, the order and
required length is not guaranteed to be parallel to the input data.
\item When constructing a layer using
a \verb{stat_*()} function, the \code{...} argument can be used to pass on
parameters to the \code{geom} part of the layer. An example of this is
\code{stat_density(geom = "area", outline.type = "both")}. The geom's
documentation lists which parameters it can accept.
\item Inversely, when constructing a layer using a
\verb{geom_*()} function, the \code{...} argument can be used to pass on parameters
to the \code{stat} part of the layer. An example of this is
\code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation
lists which parameters it can accept.
\item The \code{key_glyph} argument of \code{\link[ggplot2:layer]{layer()}} may also be passed on through
\code{...}. This can be one of the functions described as
\link[ggplot2:draw_key]{key glyphs}, to change the display of the layer in the legend.
}}

\item{vjust}{A numeric vector specifying vertical justification.
    If specified, overrides the \code{just} setting.}

\item{linejoin}{Line join style (round, mitre, bevel).}

\item{na.rm}{If \code{FALSE}, the default, missing values are removed with
a warning. If \code{TRUE}, missing values are silently removed.}

\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA}, the default, includes if any aesthetics are mapped.
\code{FALSE} never includes, and \code{TRUE} always includes.
It can also be a named logical vector to finely select the aesthetics to
display.}

\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
rather than combining with them. This is most useful for helper functions
that define both data and aesthetics and shouldn't inherit behaviour from
the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.}

\item{range.orientation}{\code{character()} one of "top" or "bottom", specifying
where the half ranges will be plotted with respect to each transcript
(\code{y}).}
}
\value{
the return value of a \verb{geom_*} function is not intended to be
directly handled by users. Therefore, \verb{geom_*} functions should never be
executed in isolation, rather used in combination with a
\code{ggplot2::ggplot()} call.
}
\description{
\code{geom_range()} and \code{geom_half_range()} draw tiles that are designed to
represent range-based genomic features, such as exons. In combination with
\code{geom_intron()}, these geoms form the core components for visualizing
transcript structures.
}
\details{
\code{geom_range()} and \code{geom_half_range()} require the following \code{aes()};
\code{xstart}, \code{xend} and \code{y} (e.g. transcript name). \code{geom_half_range()} takes
advantage of the vertical symmetry of transcript annotation by plotting only
half of a range on the top or bottom of a transcript structure. This can be
useful for comparing between two transcripts or freeing up plotting space for
other transcript annotations (e.g. \code{geom_junction()}).
}
\examples{

library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()

# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()

base <- sod1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    ))

# geom_range() is designed to visualise range-based annotation such as exons
base + geom_range()

# geom_half_range() allows users to plot half ranges
# on the top or bottom of the transcript
base + geom_half_range()

# where the half ranges are plotted can be adjusted using range.orientation
base + geom_half_range(range.orientation = "top")

# as a ggplot2 extension, ggtranscript geoms inherit
# the functionality from the parameters and aesthetics in ggplot2
base + geom_range(
    aes(fill = transcript_name),
    linewidth = 1
)

# together, geom_range() and geom_intron() are designed to visualize
# the core components of transcript annotation
base + geom_range(
    aes(fill = transcript_biotype)
) +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name")
    )

# for protein coding transcripts
# geom_range() can be useful for visualizing UTRs that lie outside of the CDS
sod1_exons_prot_coding <- sod1_exons \%>\%
    dplyr::filter(transcript_biotype == "protein_coding")

# extract cds
sod1_cds <- sod1_annotation \%>\%
    dplyr::filter(type == "CDS")

sod1_exons_prot_coding \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range(
        fill = "white",
        height = 0.25
    ) +
    geom_range(
        data = sod1_cds
    ) +
    geom_intron(
        data = to_intron(sod1_exons_prot_coding, "transcript_name")
    )

# geom_half_range() can be useful for comparing between two transcripts
# enabling visualization of one transcript on the top, other on the bottom
sod1_201_exons <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-201")
sod1_201_cds <- sod1_cds \%>\% dplyr::filter(transcript_name == "SOD1-201")
sod1_202_exons <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-202")
sod1_202_cds <- sod1_cds \%>\% dplyr::filter(transcript_name == "SOD1-202")

sod1_201_plot <- sod1_201_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = "SOD1-201/202"
    )) +
    geom_half_range(
        fill = "white",
        height = 0.125
    ) +
    geom_half_range(
        data = sod1_201_cds
    ) +
    geom_intron(
        data = to_intron(sod1_201_exons, "transcript_name")
    )

sod1_201_plot

sod1_201_202_plot <- sod1_201_plot +
    geom_half_range(
        data = sod1_202_exons,
        range.orientation = "top",
        fill = "white",
        height = 0.125
    ) +
    geom_half_range(
        data = sod1_202_cds,
        range.orientation = "top",
        fill = "purple"
    ) +
    geom_intron(
        data = to_intron(sod1_202_exons, "transcript_name")
    )

sod1_201_202_plot

# leveraging existing ggplot2 functionality via e.g. coord_cartesian()
# can be useful to zoom in on areas of interest
sod1_201_202_plot + coord_cartesian(xlim = c(31659500, 31660000))
}


================================================
FILE: man/ggtranscript.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ggtranscript-package.R
\docType{package}
\name{ggtranscript}
\alias{ggtranscript-package}
\alias{ggtranscript}
\title{\code{ggtranscript}: Visualizing transcript structure and annotation using
\code{ggplot2}}
\description{
The goal of \code{ggtranscript} is to simplify the process of visualizing
transcript structure and annotation. To achieve this, \code{ggtranscript}
introduces 5 new geoms (\code{geom_range()}, \code{geom_half_range()}, \code{geom_intron()},
\code{geom_junction()} and \code{geom_junction_label_repel()}) as well as several
helper functions. As a \code{ggplot2} extension, \code{ggtranscript} inherits
\code{ggplot2}'s familiarity and flexibility, enabling users to intuitively adjust
aesthetics, parameters, scales etc as well as complement \code{ggtranscript} geoms
with existing \code{ggplot2} geoms to create informative, publication-ready plots.
}
\seealso{
Useful links:
\itemize{
  \item \url{https://github.com/dzhang32/ggtranscript}
  \item Report bugs at \url{https://github.com/dzhang32/ggtranscript/issues}
}

}
\author{
\strong{Maintainer}: David Zhang \email{dyzhang32@gmail.com} (\href{https://orcid.org/0000-0003-2382-8460}{ORCID})

Authors:
\itemize{
  \item Emil Gustavsson \email{e.gustavsson@ucl.ac.uk} (\href{https://orcid.org/0000-0003-0541-7537}{ORCID})
}

Other contributors:
\itemize{
  \item Regina Reynolds \email{regina.reynolds.16@ucl.ac.uk} (\href{https://orcid.org/0000-0001-6470-7919}{ORCID}) [contributor]
  \item Sonia Ruiz \email{s.ruiz@ucl.ac.uk} [contributor]
}

}


================================================
FILE: man/shorten_gaps.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/shorten_gaps.R
\name{shorten_gaps}
\alias{shorten_gaps}
\title{Improve transcript structure visualization by shortening gaps}
\usage{
shorten_gaps(exons, introns, group_var = NULL, target_gap_width = 100L)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}

\item{introns}{\code{data.frame()} the intron co-ordinates corresponding to the
input \code{exons}. This can be created by applying \code{to_intron()} to the
\code{exons}. If introns originate from multiple transcripts, they must be
differentiated using \code{group_var}. If a user is not using \code{to_intron()},
they must make sure intron start/ends are defined precisely as the adjacent
exon boundaries (rather than exon end + 1 and exon start - 1).}

\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}

\item{target_gap_width}{\code{integer()} the width in base pairs to shorten the
gaps to.}
}
\value{
\code{data.frame()} contains the re-scaled co-ordinates of \code{introns} and
\code{exons} of each input transcript with shortened gaps.
}
\description{
For a given set of exons and introns, \code{shorten_gaps()} reduces the width of
gaps (regions that do not overlap any \code{exons}) to a user-inputted
\code{target_gap_width}. This can be useful when visualizing transcripts that have
long introns, to hone in on the regions of interest (i.e. exons) and better
compare between transcript structures.
}
\details{
After \code{shorten_gaps()} reduces the size of gaps, it will re-scale \code{exons} and
\code{introns} to preserve exon alignment. This process will only reduce the width
of input \code{introns}, never \code{exons}. Importantly, the outputted re-scaled
co-ordinates should only be used for visualization as they will not match the
original genomic coordinates.
}
\examples{

library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation \%>\% head()

# extract exons
pknox1_exons <- pknox1_annotation \%>\% dplyr::filter(type == "exon")
pknox1_exons \%>\% head()

# to_intron() is a helper function included in ggtranscript
# which is useful for converting exon co-ordinates to introns
pknox1_introns <- pknox1_exons \%>\% to_intron(group_var = "transcript_name")
pknox1_introns \%>\% head()

# for transcripts with long introns, the exons of interest
# can be difficult to visualize clearly when using the default scale
pknox1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = pknox1_introns,
        arrow.min.intron.length = 3500
    )

# in such cases it can be useful to rescale the exons and introns
# using shorten_gaps() which shortens regions that do not overlap an exon
pknox1_rescaled <-
    shorten_gaps(pknox1_exons, pknox1_introns, group_var = "transcript_name")

pknox1_rescaled \%>\% head()

# this allows us to visualize differences in exonic structure more clearly
pknox1_rescaled \%>\%
    dplyr::filter(type == "exon") \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = pknox1_rescaled \%>\% dplyr::filter(type == "intron"),
        arrow.min.intron.length = 300
    )

# shorten_gaps() can be used in combination with to_diff()
# to further highlight differences in exon structure
# here, all other transcripts are compared to the MANE-select transcript
pknox1_rescaled_diffs <- to_diff(
    exons = pknox1_rescaled \%>\%
        dplyr::filter(type == "exon", transcript_name != "PKNOX1-201"),
    ref_exons = pknox1_rescaled \%>\%
        dplyr::filter(type == "exon", transcript_name == "PKNOX1-201"),
    group_var = "transcript_name"
)

pknox1_rescaled \%>\%
    dplyr::filter(type == "exon") \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = pknox1_rescaled \%>\% dplyr::filter(type == "intron"),
        arrow.min.intron.length = 300
    ) +
    geom_range(
        data = pknox1_rescaled_diffs,
        aes(fill = diff_type),
        alpha = 0.2
    )
}


================================================
FILE: man/sod1_annotation.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{sod1_annotation}
\alias{sod1_annotation}
\alias{pknox1_annotation}
\title{Example transcript annotation}
\format{
A \code{tibble::tibble()}:
\describe{
\item{seqnames}{\code{factor()} chromosome.}
\item{start}{\code{integer()} start position.}
\item{end}{\code{integer()} end position.}
\item{strand}{\code{factor()} strand.}
\item{type}{\code{factor()} E.g. gene, transcript, exon or CDS.}
\item{gene_name}{\code{character()} name of gene (e.g. SOD1 or PKNOX1).}
\item{transcript_name}{\code{character()} name of transcript.}
\item{transcript_biotype}{\code{character()} biotype of transcript.}
}

An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 112 rows and 8 columns.
}
\source{
generated using \code{ggtranscript/data-raw/sod1_pknox1_annotation.R}
}
\usage{
sod1_annotation

pknox1_annotation
}
\description{
Transcript annotation including the co-ordinates (hg38) of the genes,
transcripts, exons and CDS regions for \emph{SOD1} and \emph{PKNOX1}, which
originate from version 105 of the Ensembl reference annotation.
}
\keyword{datasets}


================================================
FILE: man/sod1_junctions.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{sod1_junctions}
\alias{sod1_junctions}
\title{Example junctions}
\format{
A \code{tibble::tibble()}:
\describe{
\item{seqnames}{\code{factor()} chromosome.}
\item{start}{\code{integer()} start position.}
\item{end}{\code{integer()} end position.}
\item{strand}{\code{factor()} strand.}
\item{mean_count}{\code{double()} Average count across all GTEx liver samples.}
}
}
\source{
generated using \code{ggtranscript/data-raw/sod1_junctions.R}
}
\usage{
sod1_junctions
}
\description{
Junction co-ordinates and counts associated with the \emph{SOD1} gene.
Junction counts originate from GTEx liver samples and are downloaded via the
Bioconductor package \code{snapcount}. Only unannotated junctions with a mean
count above 0.3 have been retained for this example.
}
\keyword{datasets}


================================================
FILE: man/to_diff.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/to_diff.R
\name{to_diff}
\alias{to_diff}
\title{Obtain the differences between transcript structure}
\usage{
to_diff(exons, ref_exons, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}

\item{ref_exons}{\code{data.frame()} contains exons that originate from a single
transcript, which \code{exons} will be compared against.}

\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} details the differences between \code{exons} and
\code{ref_exons}.
}
\description{
\code{to_diff()} obtains the difference between \code{exons} from a set of transcripts
and a reference transcript (\code{ref_exons}). This can be useful when visualizing
the differences between transcript structure. \code{to_diff()} expects two sets of
input exons; 1. \code{exons} - exons from any number of transcripts that will be
compared to \code{ref_exons} and 2. \code{ref_exons} - exons from a single transcript
which acts as the reference to compare against.
}
\examples{

library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()

# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()

# for this example, let's compare transcripts to the MANE-select transcript
sod1_mane <- sod1_exons \%>\% dplyr::filter(transcript_name == "SOD1-201")
sod1_not_mane <- sod1_exons \%>\% dplyr::filter(transcript_name != "SOD1-201")

# to_diff() obtains the differences between the exons as ranges
sod1_diffs <- to_diff(
    exons = sod1_not_mane,
    ref_exons = sod1_mane,
    group_var = "transcript_name"
)

sod1_diffs \%>\% head()

# using geom_range(), it can be useful to visually overlay
# the differences on top of the transcript annotation
sod1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name")
    ) +
    geom_range(
        data = sod1_diffs,
        ggplot2::aes(fill = diff_type),
        alpha = 0.2
    )
}


================================================
FILE: man/to_intron.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/to_intron.R
\name{to_intron}
\alias{to_intron}
\title{Convert exon co-ordinates to introns}
\usage{
to_intron(exons, group_var = NULL)
}
\arguments{
\item{exons}{\code{data.frame()} contains exons which can originate from multiple
transcripts differentiated by \code{group_var}.}

\item{group_var}{\code{character()} if input data originates from more than 1
transcript, \code{group_var} must specify the column that differentiates
transcripts (e.g. "transcript_id").}
}
\value{
\code{data.frame()} contains the intron co-ordinates.
}
\description{
Given a set of \code{exons}, \code{to_intron()} will return the corresponding introns.
}
\details{
It is important to note that, for visualization purposes, \code{to_intron()}
defines introns precisely as the exon boundaries, rather than the intron
start/end being (exon end + 1)/(exon start - 1).
}
\examples{
library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
sod1_annotation \%>\% head()

# extract exons
sod1_exons <- sod1_annotation \%>\% dplyr::filter(type == "exon")
sod1_exons \%>\% head()

# to_intron() is a helper function included in ggtranscript
# which is useful for converting exon co-ordinates to introns
sod1_introns <- sod1_exons \%>\% to_intron(group_var = "transcript_name")
sod1_introns \%>\% head()

# this can be particularly useful when combined with
# geom_range() and geom_intron()
# to visualize the core components of transcript annotation
sod1_exons \%>\%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_intron(
        data = to_intron(sod1_exons, "transcript_name")
    )
}


================================================
FILE: tests/testthat/test-add_exon_number.R
================================================
# exon-level records from the bundled SOD1 example annotation
sod1_exons <- dplyr::filter(sod1_annotation, type == "exon")

# fixture used purely to exercise strand handling:
# SOD1-202 has its strand overwritten with "-",
# SOD1-201 keeps the strand it has in the annotation
test_exons <- dplyr::bind_rows(
    sod1_exons %>%
        dplyr::filter(transcript_name == "SOD1-202") %>%
        dplyr::mutate(strand = "-"),
    sod1_exons %>%
        dplyr::filter(transcript_name == "SOD1-201")
)

##### add_exon_number #####

testthat::test_that("add_exon_number() works correctly", {
    numbered <- add_exon_number(test_exons, group_var = "transcript_name")

    plus_rows <- dplyr::filter(numbered, strand == "+")
    minus_rows <- dplyr::filter(numbered, strand == "-")

    # the new column must exist and hold numbers
    expect_true("exon_number" %in% colnames(numbered))
    expect_true(is.numeric(numbered[["exon_number"]]))

    # exon numbers are expected to ascend over the "+" rows
    # and descend over the "-" rows
    expect_equal(plus_rows[["exon_number"]], seq_len(nrow(plus_rows)))
    expect_equal(minus_rows[["exon_number"]], rev(seq_len(nrow(minus_rows))))

    # shuffling the input rows must not change the output
    set.seed(32)
    n_exons <- nrow(test_exons)
    shuffled_exons <- test_exons[sample(seq_len(n_exons), n_exons), ]
    expect_equal(
        add_exon_number(shuffled_exons, group_var = "transcript_name"),
        numbered
    )
})

testthat::test_that("add_exon_number(group_var = NULL) works correctly", {
    # without a grouping column, number each strand's exons separately
    plus_numbered <- add_exon_number(
        dplyr::filter(test_exons, strand == "+"),
        group_var = NULL
    )
    minus_numbered <- add_exon_number(
        dplyr::filter(test_exons, strand == "-"),
        group_var = NULL
    )

    # ascending numbers expected for "+", descending for "-"
    expect_equal(
        plus_numbered[["exon_number"]],
        seq_len(nrow(plus_numbered))
    )
    expect_equal(
        minus_numbered[["exon_number"]],
        rev(seq_len(nrow(minus_numbered)))
    )
})


================================================
FILE: tests/testthat/test-add_utr.R
================================================
# split the bundled PKNOX1 annotation by feature type
pknox1_exons <- dplyr::filter(pknox1_annotation, type == "exon")
pknox1_cds <- dplyr::filter(pknox1_annotation, type == "CDS")
pknox1_utr <- dplyr::filter(pknox1_annotation, grepl("utr", type))

##### add_utr #####

# the ensembl cds does not include the stop codon, so within each transcript
# the cds row(s) with the largest end are extended by 3 bp
pknox1_cds_w_stop <- dplyr::ungroup(
    dplyr::mutate(
        dplyr::group_by(pknox1_cds, transcript_name),
        end = ifelse(end == max(end), end + 3, end)
    )
)

# every transcript, differentiated by transcript_name
pknox1_cds_utr <- add_utr(
    pknox1_exons,
    pknox1_cds_w_stop,
    group_var = "transcript_name"
)

# a single transcript, still passing group_var
pknox1_cds_utr_1_tx <- add_utr(
    dplyr::filter(pknox1_exons, transcript_name == "PKNOX1-203"),
    dplyr::filter(pknox1_cds_w_stop, transcript_name == "PKNOX1-203"),
    group_var = "transcript_name"
)

# the same single transcript, without group_var
pknox1_cds_utr_1_tx_no_group <- add_utr(
    dplyr::filter(pknox1_exons, transcript_name == "PKNOX1-203"),
    dplyr::filter(pknox1_cds_w_stop, transcript_name == "PKNOX1-203"),
    group_var = NULL
)

# Check the output of add_utr() against reference UTR and CDS annotation.
#
# cds_utr_add_utr: data.frame to check (the output of add_utr()); its rows are
#   split on `type` being "UTR" or "CDS".
# utr_annotation, cds_annotation: data.frames holding the expected UTR and CDS
#   ranges. Only their `start` and `end` columns are compared.
#
# Returns a single logical: TRUE only when `type` contains no NA, there is no
# `dummy_group` column and both the UTR and CDS co-ordinates match the
# expected ranges; FALSE otherwise.
test_add_utrs <- function(cds_utr_add_utr, utr_annotation, cds_annotation) {
    # reduce ranges to their start/end, sorted, so that row order and any
    # additional columns do not influence the comparison
    sorted_coords <- function(ranges) {
        ranges %>%
            dplyr::select(start, end) %>%
            dplyr::arrange(start, end)
    }

    utr_add_utr <- cds_utr_add_utr %>%
        dplyr::filter(type == "UTR") %>%
        sorted_coords()

    cds_add_utr <- cds_utr_add_utr %>%
        dplyr::filter(type == "CDS") %>%
        sorted_coords()

    no_na_type <- !anyNA(cds_utr_add_utr[["type"]])
    no_dummy_group <- is.null(cds_utr_add_utr[["dummy_group"]])

    # all.equal() returns a character description (not FALSE) on a mismatch,
    # which all() cannot use as a logical; isTRUE() guarantees TRUE/FALSE
    correct_utrs <- isTRUE(
        all.equal(utr_add_utr, sorted_coords(utr_annotation))
    )
    correct_cds <- isTRUE(
        all.equal(cds_add_utr, sorted_coords(cds_annotation))
    )

    check_add_utr <- all(no_na_type, no_dummy_group, correct_utrs, correct_cds)

    return(check_add_utr)
}

testthat::test_that(
    "add_utr() works correctly",
    {
        # reference annotation for the single-transcript cases
        pknox1_203_utr <- dplyr::filter(
            pknox1_utr,
            transcript_name == "PKNOX1-203"
        )
        pknox1_203_cds <- dplyr::filter(
            pknox1_cds_w_stop,
            transcript_name == "PKNOX1-203"
        )

        # all transcripts
        expect_true(
            test_add_utrs(pknox1_cds_utr, pknox1_utr, pknox1_cds_w_stop)
        )
        # one transcript, grouped
        expect_true(
            test_add_utrs(pknox1_cds_utr_1_tx, pknox1_203_utr, pknox1_203_cds)
        )
        # one transcript, no grouping variable
        expect_true(
            test_add_utrs(
                pknox1_cds_utr_1_tx_no_group,
                pknox1_203_utr,
                pknox1_203_cds
            )
        )
    }
)

##### add_utr & shorten_gaps #####

# introns derived from the CDS/UTR ranges returned by add_utr()
pknox1_cds_utr_introns <- to_intron(pknox1_cds_utr, "transcript_name")

# rescale the CDS/UTR ranges using those introns
pknox1_cds_utr_rescaled <- shorten_gaps(
    exons = pknox1_cds_utr,
    introns = pknox1_cds_utr_introns,
    group_var = "transcript_name"
)

# Plot CDS/UTR structures before and after rescaling as two stacked panels.
#
# cds_utr_before: ranges with a `type` column containing "CDS" and "UTR" rows.
# cds_utr_after: the rescaled ranges; rows of type "intron" are dropped before
#   the introns to draw are re-derived with to_intron().
# group_var: name of the column identifying each transcript. Used for the y
#   axis and as the grouping variable passed to to_intron().
# add_labels: if TRUE, label the plotted CDS ranges with their start
#   co-ordinate, which helps manual checking.
#
# Returns the ggpubr::ggarrange() of the "before" plot above the "after" plot.
plot_before_after_rescaled <- function(cds_utr_before,
                                       cds_utr_after,
                                       group_var,
                                       add_labels = FALSE) {
    # CDS drawn with the default geom_range(), UTRs with a reduced height and
    # white fill, and introns derived from `exons_for_introns`
    plot_cds_utr <- function(cds_utr, exons_for_introns = cds_utr) {
        cds_utr %>%
            dplyr::filter(type == "CDS") %>%
            ggplot2::ggplot(ggplot2::aes(
                xstart = start,
                xend = end,
                y = .data[[group_var]]
            )) +
            geom_range() +
            geom_range(
                data = cds_utr %>% dplyr::filter(type == "UTR"),
                height = 0.25,
                fill = "white"
            ) +
            geom_intron(
                # group by `group_var` rather than a hard-coded column name
                data = to_intron(exons_for_introns, group_var)
            )
    }

    before_rescaling <- plot_cds_utr(cds_utr_before)

    after_rescaling <- plot_cds_utr(
        cds_utr_after,
        exons_for_introns = cds_utr_after %>%
            dplyr::filter(type != "intron")
    )

    before_after_list <- list(before_rescaling, after_rescaling)

    if (add_labels) {
        for (i in seq_along(before_after_list)) {
            before_after_list[[i]] <- before_after_list[[i]] +
                ggrepel::geom_label_repel(
                    # aes_string() is deprecated; use the .data pronoun as
                    # the base plots above already do
                    ggplot2::aes(
                        x = start,
                        y = .data[[group_var]],
                        label = start
                    ),
                    min.segment.length = 0
                )
        }
    }

    before_after_plot <- ggpubr::ggarrange(
        plotlist = before_after_list, nrow = 2
    )

    return(before_after_plot)
}

testthat::test_that(
    "shorten_gaps works correctly",
    {
        # snapshot the unlabelled before/after comparison
        vdiffr::expect_doppelganger(
            "test rescaled with utr plot",
            plot_before_after_rescaled(
                pknox1_cds_utr,
                pknox1_cds_utr_rescaled,
                group_var = "transcript_name",
                add_labels = FALSE
            )
        )
    }
)


================================================
FILE: tests/testthat/test-geom_half_range.R
================================================
# dummy exons: two for transcript A ("+" strand) and two for transcript B
# ("-" strand), each ending 100 after its start
test_exons <- dplyr::tibble(
    start = c(100, 300, 500, 650),
    end = start + 100,
    strand = rep(c("+", "-"), each = 2),
    tx = rep(c("A", "B"), each = 2)
)

# base plot shared by the tests below; each test adds its own layers
test_exons_plot <- ggplot2::ggplot(
    test_exons,
    aes(xstart = start, xend = end, y = tx)
)

##### geom_half_range #####

testthat::test_that(
    "geom_half_range() works correctly",
    {
        # build every variant first, keyed by its snapshot title
        half_range_plots <- list(
            "Base geom_half_range plot" = test_exons_plot +
                geom_half_range(),
            "With param geom_half_range plot" = test_exons_plot +
                geom_half_range(colour = "red", fill = "blue"),
            "With aes geom_half_range plot" = test_exons_plot +
                geom_half_range(aes(fill = tx)),
            "With facet geom_half_range plot" = test_exons_plot +
                geom_half_range() +
                ggplot2::facet_wrap(~tx)
        )

        # compare each plot with its snapshot, in the order defined above
        for (snapshot_title in names(half_range_plots)) {
            vdiffr::expect_doppelganger(
                snapshot_title,
                half_range_plots[[snapshot_title]]
            )
        }
    }
)

testthat::test_that(
    "geom_half_range(range.orientation = x) works correctly",
    {
        # "top" on its own, then "top" and "bottom" layered on the same plot
        orientation_plots <- list(
            "With top geom_half_range plot" = test_exons_plot +
                geom_half_range(range.orientation = "top"),
            "With both geom_half_range plot" = test_exons_plot +
                geom_half_range(range.orientation = "top", fill = "red") +
                geom_half_range(range.orientation = "bottom", fill = "blue")
        )

        for (snapshot_title in names(orientation_plots)) {
            vdiffr::expect_doppelganger(
                snapshot_title,
                orientation_plots[[snapshot_title]]
            )
        }
    }
)

testthat::test_that(
    "geom_half_range() catches user input errors",
    {
        # the plot is built outside expect_error(); the error is expected
        # once the plot is printed
        invalid_orientation_plot <- test_exons_plot +
            geom_half_range(range.orientation = "a")

        expect_error(
            print(invalid_orientation_plot),
            regexp = "range.orientation must be one of"
        )
    }
)


================================================
FILE: tests/testthat/test-geom_intron.R
================================================
# dummy introns: one for transcript A ("+" strand), one for B ("-" strand)
test_introns <- dplyr::tibble(
    strand = c("+", "-"),
    tx = c("A", "B"),
    start = c(201, 601),
    end = c(299, 649),
    type = "intron"
)

# base plot shared by the tests below; each test adds its own layers
test_introns_plot <- ggplot2::ggplot(
    test_introns,
    aes(xstart = start, xend = end, y = tx)
)

##### geom_intron #####

testthat::test_that(
    "geom_intron() works correctly",
    {
        # build every variant first, keyed by its snapshot title
        intron_plots <- list(
            "Base geom_intron plot" = test_introns_plot +
                geom_intron(),
            "With param geom_intron plot" = test_introns_plot +
                geom_intron(colour = "blue", linewidth = 2),
            "With aes geom_intron plot" = test_introns_plot +
                geom_intron(aes(colour = tx, linewidth = c(1L, 2L))),
            "With facet geom_intron plot" = test_introns_plot +
                geom_intron() +
                ggplot2::facet_wrap(~tx)
        )

        # compare each plot with its snapshot, in the order defined above
        for (snapshot_title in names(intron_plots)) {
            vdiffr::expect_doppelganger(
                snapshot_title,
                intron_plots[[snapshot_title]]
            )
        }
    }
)

testthat::test_that(
    "geom_intron(strand = x) works correctly",
    {
        # strand given as a character, as a factor, and as a mapped aesthetic
        strand_plots <- list(
            "Minus strand plot" = test_introns_plot +
                geom_intron(strand = "-"),
            "factor strand plot" = test_introns_plot +
                geom_intron(strand = factor("-")),
            "As aes strand plot" = test_introns_plot +
                geom_intron(aes(strand = strand))
        )

        for (snapshot_title in names(strand_plots)) {
            vdiffr::expect_doppelganger(
                snapshot_title,
                strand_plots[[snapshot_title]]
            )
        }
    }
)

testthat::test_that(
    "geom_intron(arrow.min.intron.length = x) works correctly",
    {
        # the same minimum length, with the default and an explicit strand
        arrow_min_plots <- list(
            "base arrow.min plot" = test_introns_plot +
                geom_intron(arrow.min.intron.length = 50),
            "with strand arrow.min plot" = test_introns_plot +
                geom_intron(arrow.min.intron.length = 50, strand = "-")
        )

        for (snapshot_title in names(arrow_min_plots)) {
            vdiffr::expect_doppelganger(
                snapshot_title,
                arrow_min_plots[[snapshot_title]]
            )
        }
    }
)

testthat::test_that(
    "geom_intron() catches strand input errors",
    {
        # invalid strands: containing NA, an unknown character, and a numeric
        # column mapped as an aesthetic
        invalid_strand_plots <- list(
            test_introns_plot +
                geom_intron(strand = c(NA, rep("+", nrow(test_introns) - 1))),
            test_introns_plot +
                geom_intron(strand = "a"),
            test_introns_plot +
                geom_intron(aes(strand = start))
        )

        # seems to require print to catch error
        for (invalid_plot in invalid_strand_plots) {
            expect_error(
                print(invalid_plot),
                "strand values must be one of"
            )
        }
    }
)

testthat::test_that(
    "geom_intron() catches arrow.min.intron.length input errors",
    {
        # invalid minimum lengths: a negative number and a character
        invalid_arrow_min_plots <- list(
            test_introns_plot +
                geom_intron(arrow.min.intron.length = -1),
            test_introns_plot +
                geom_intron(arrow.min.intron.length = "1")
        )

        # seems to require print to catch error
        for (invalid_plot in invalid_arrow_min_plots) {
            expect_error(
                print(invalid_plot),
                "arrow.min.intron.length must be "
            )
        }
    }
)


================================================
FILE: tests/testth
Download .txt
gitextract_nqf_jr3r/

├── .Rbuildignore
├── .github/
│   ├── .gitignore
│   └── workflows/
│       └── check-bioc.yml
├── .gitignore
├── .pre-commit-config.yaml
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│   ├── add_exon_number.R
│   ├── add_utr.R
│   ├── data.R
│   ├── geom_half_range.R
│   ├── geom_intron.R
│   ├── geom_junction.R
│   ├── geom_junction_label_repel.R
│   ├── geom_range.R
│   ├── ggtranscript-package.R
│   ├── globals.R
│   ├── shorten_gaps.R
│   ├── to_diff.R
│   ├── to_intron.R
│   └── utils.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── codecov.yml
├── data/
│   ├── pknox1_annotation.rda
│   ├── sod1_annotation.rda
│   └── sod1_junctions.rda
├── data-raw/
│   ├── ggplot2_exts_thumbnail.R
│   ├── ggtranscript_logo.R
│   ├── sod1_junctions.R
│   └── sod1_pknox1_annotation.R
├── inst/
│   └── CITATION
├── man/
│   ├── add_exon_number.Rd
│   ├── add_utr.Rd
│   ├── geom_intron.Rd
│   ├── geom_junction.Rd
│   ├── geom_junction_label_repel.Rd
│   ├── geom_range.Rd
│   ├── ggtranscript.Rd
│   ├── shorten_gaps.Rd
│   ├── sod1_annotation.Rd
│   ├── sod1_junctions.Rd
│   ├── to_diff.Rd
│   └── to_intron.Rd
├── tests/
│   ├── testthat/
│   │   ├── test-add_exon_number.R
│   │   ├── test-add_utr.R
│   │   ├── test-geom_half_range.R
│   │   ├── test-geom_intron.R
│   │   ├── test-geom_junction.R
│   │   ├── test-geom_junction_label_repel.R
│   │   ├── test-geom_range.R
│   │   ├── test-shorten_gaps.R
│   │   ├── test-to_diff.R
│   │   ├── test-to_intron.R
│   │   └── test-utils.R
│   └── testthat.R
└── vignettes/
    ├── .gitignore
    └── ggtranscript.Rmd
Condensed preview — 62 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (246K chars).
[
  {
    "path": ".Rbuildignore",
    "chars": 195,
    "preview": "^ggtranscript\\.Rproj$\n^\\.Rproj\\.user$\n^dev$\n^README\\.Rmd$\n^\\.github$\n^codecov\\.yml$\n^.pre-commit-config.yaml$\n^data-raw$"
  },
  {
    "path": ".github/.gitignore",
    "chars": 7,
    "preview": "*.html\n"
  },
  {
    "path": ".github/workflows/check-bioc.yml",
    "chars": 9493,
    "preview": "on:\n  push:\n  pull_request:\n\nname: R-CMD-check-bioc\n\n## These environment variables control whether to run GHA code late"
  },
  {
    "path": ".gitignore",
    "chars": 46,
    "preview": ".Rproj.user\ninst/doc\n*.DS_Store\n*.pdf\n*.Rproj\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "chars": 1103,
    "preview": "repos:\n-   repo: https://github.com/lorenzwalthert/precommit\n    rev: v0.1.3.9133\n    hooks:\n    -   id: readme-rmd-rend"
  },
  {
    "path": "DESCRIPTION",
    "chars": 2112,
    "preview": "Package: ggtranscript\nTitle: Visualizing Transcript Structure and Annotation using 'ggplot2'\nVersion: 1.0.0\nAuthors@R:\n "
  },
  {
    "path": "LICENSE",
    "chars": 50,
    "preview": "YEAR: 2022\nCOPYRIGHT HOLDER: ggtranscript authors\n"
  },
  {
    "path": "LICENSE.md",
    "chars": 1079,
    "preview": "# MIT License\n\nCopyright (c) 2022 ggtranscript authors\n\nPermission is hereby granted, free of charge, to any person obta"
  },
  {
    "path": "NAMESPACE",
    "chars": 352,
    "preview": "# Generated by roxygen2: do not edit by hand\n\nexport(add_exon_number)\nexport(add_utr)\nexport(geom_half_range)\nexport(geo"
  },
  {
    "path": "NEWS.md",
    "chars": 1762,
    "preview": "\n# ggtranscript 1.0.0\n\n## Fixes\n\n* Updates `geom`s to work with the latest version of `ggplot2` and `ggrepel` (R version"
  },
  {
    "path": "R/add_exon_number.R",
    "chars": 2950,
    "preview": "#' Add exon number\n#'\n#' `add_exon_number()` adds the exon number (the order the exons are transcribed\n#' within each tr"
  },
  {
    "path": "R/add_utr.R",
    "chars": 4670,
    "preview": "#' Add untranslated regions (UTRs)\n#'\n#' Given a set of `exons` (encompassing the CDS and UTRs) and `cds` regions,\n#' `a"
  },
  {
    "path": "R/data.R",
    "chars": 1591,
    "preview": "#' Example transcript annotation\n#'\n#' Transcript annotation including the co-ordinates (hg38) of the genes,\n#' transcri"
  },
  {
    "path": "R/geom_half_range.R",
    "chars": 2629,
    "preview": "#' @param range.orientation `character()` one of \"top\" or \"bottom\", specifying\n#'   where the half ranges will be plotte"
  },
  {
    "path": "R/geom_intron.R",
    "chars": 9970,
    "preview": "#' Plot intron lines with strand arrows\n#'\n#' `geom_intron()` draws horizontal lines with central arrows that are design"
  },
  {
    "path": "R/geom_junction.R",
    "chars": 11224,
    "preview": "#' Plot junction curves\n#'\n#' `geom_junction()` draws curves that are designed to represent junction reads\n#' from RNA-s"
  },
  {
    "path": "R/geom_junction_label_repel.R",
    "chars": 10907,
    "preview": "#' Label junction curves\n#'\n#' `geom_junction_label_repel()` labels junction curves at their midpoint using\n#' `ggrepel:"
  },
  {
    "path": "R/geom_range.R",
    "chars": 7426,
    "preview": "#' Plot genomic ranges\n#'\n#' `geom_range()` and `geom_half_range()` draw tiles that are designed to\n#' represent range-b"
  },
  {
    "path": "R/ggtranscript-package.R",
    "chars": 866,
    "preview": "#' `ggtranscript`: Visualizing transcript structure and annotation using\n#' `ggplot2`\n#'\n#' The goal of `ggtranscript` i"
  },
  {
    "path": "R/globals.R",
    "chars": 952,
    "preview": "# bypass R CMD Check notes, related to tidyverse non-standard evaluation\n# https://www.r-bloggers.com/2019/08/no-visible"
  },
  {
    "path": "R/shorten_gaps.R",
    "chars": 17394,
    "preview": "#' Improve transcript structure visualization by shortening gaps\n#'\n#' For a given set of exons and introns, `shorten_ga"
  },
  {
    "path": "R/to_diff.R",
    "chars": 5341,
    "preview": "#' Obtain the differences between transcript structure\n#'\n#' `to_diff()` obtains the difference between `exons` from a s"
  },
  {
    "path": "R/to_intron.R",
    "chars": 2512,
    "preview": "#' Convert exon co-ordinates to introns\n#'\n#' Given a set of `exons`, `to_intron()` will return the corresponding intron"
  },
  {
    "path": "R/utils.R",
    "chars": 1120,
    "preview": "#' @keywords internal\n#' @noRd\n.check_coord_object <- function(x,\n                                check_seqnames = FALSE"
  },
  {
    "path": "README.Rmd",
    "chars": 8428,
    "preview": "---\noutput: github_document\n---\n\n```{r, include = FALSE}\nknitr::opts_chunk$set(\n    collapse = TRUE,\n    comment = \"#>\","
  },
  {
    "path": "README.md",
    "chars": 10936,
    "preview": "\n# ggtranscript <img src=\"man/figures/ggtranscript_logo_cropped.svg\" align=\"right\" height=\"139\" />\n\n<!-- badges: start -"
  },
  {
    "path": "_pkgdown.yml",
    "chars": 45,
    "preview": "template:\n  bootstrap: 5\n  bootswatch: cosmo\n"
  },
  {
    "path": "codecov.yml",
    "chars": 232,
    "preview": "comment: false\n\ncoverage:\n  status:\n    project:\n      default:\n        target: auto\n        threshold: 1%\n        infor"
  },
  {
    "path": "data-raw/ggplot2_exts_thumbnail.R",
    "chars": 1622,
    "preview": "\n# Load libraries ----------------------------------------------------------\n\nlibrary(tidyverse)\ndevtools::load_all(\".\")"
  },
  {
    "path": "data-raw/ggtranscript_logo.R",
    "chars": 4275,
    "preview": "\n# Load libraries ----------------------------------------------------------\n\nlibrary(tidyverse)\nlibrary(hexSticker)\nlib"
  },
  {
    "path": "data-raw/sod1_junctions.R",
    "chars": 1353,
    "preview": "\n# Load libraries ----------------------------------------------------------\n\nlibrary(tidyverse)\nlibrary(snapcount)\nlibr"
  },
  {
    "path": "data-raw/sod1_pknox1_annotation.R",
    "chars": 1548,
    "preview": "\n# Load libraries ----------------------------------------------------------\n\nlibrary(tidyverse)\nlibrary(rtracklayer)\nli"
  },
  {
    "path": "inst/CITATION",
    "chars": 824,
    "preview": "pkgVer <- function(pkg) {\n    if (!exists(\"meta\") || is.null(meta)) meta <- packageDescription(pkg)\n    ver <- meta$Vers"
  },
  {
    "path": "man/add_exon_number.Rd",
    "chars": 2405,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/add_exon_number.R\n\\name{add_exon_number}\n\\"
  },
  {
    "path": "man/add_utr.Rd",
    "chars": 3269,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/add_utr.R\n\\name{add_utr}\n\\alias{add_utr}\n\\"
  },
  {
    "path": "man/geom_intron.Rd",
    "chars": 8743,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/geom_intron.R\n\\name{geom_intron}\n\\alias{ge"
  },
  {
    "path": "man/geom_junction.Rd",
    "chars": 10318,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/geom_junction.R\n\\name{geom_junction}\n\\alia"
  },
  {
    "path": "man/geom_junction_label_repel.Rd",
    "chars": 8213,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/geom_junction_label_repel.R\n\\name{geom_jun"
  },
  {
    "path": "man/geom_range.Rd",
    "chars": 10051,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/geom_range.R, R/geom_half_range.R\n\\name{ge"
  },
  {
    "path": "man/ggtranscript.Rd",
    "chars": 1603,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/ggtranscript-package.R\n\\docType{package}\n\\"
  },
  {
    "path": "man/shorten_gaps.Rd",
    "chars": 4479,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/shorten_gaps.R\n\\name{shorten_gaps}\n\\alias{"
  },
  {
    "path": "man/sod1_annotation.Rd",
    "chars": 1169,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data.R\n\\docType{data}\n\\name{sod1_annotatio"
  },
  {
    "path": "man/sod1_junctions.Rd",
    "chars": 889,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data.R\n\\docType{data}\n\\name{sod1_junctions"
  },
  {
    "path": "man/to_diff.Rd",
    "chars": 2442,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/to_diff.R\n\\name{to_diff}\n\\alias{to_diff}\n\\"
  },
  {
    "path": "man/to_intron.Rd",
    "chars": 1790,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/to_intron.R\n\\name{to_intron}\n\\alias{to_int"
  },
  {
    "path": "tests/testthat/test-add_exon_number.R",
    "chars": 1960,
    "preview": "sod1_exons <- sod1_annotation %>%\n    dplyr::filter(type == \"exon\")\n\n# create dummy transcripts with both positive and m"
  },
  {
    "path": "tests/testthat/test-add_utr.R",
    "chars": 5209,
    "preview": "pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == \"exon\")\npknox1_cds <- pknox1_annotation %>% dplyr::filter(ty"
  },
  {
    "path": "tests/testthat/test-geom_half_range.R",
    "chars": 2376,
    "preview": "# create dummy exons for testing\ntest_exons <-\n    dplyr::tibble(\n        start = c(100, 300, 500, 650),\n        end = s"
  },
  {
    "path": "tests/testthat/test-geom_intron.R",
    "chars": 3860,
    "preview": "test_introns <-\n    dplyr::tibble(\n        strand = c(\"+\", \"-\"),\n        tx = c(\"A\", \"B\"),\n        start = c(201, 601),\n"
  },
  {
    "path": "tests/testthat/test-geom_junction.R",
    "chars": 4203,
    "preview": "# manually create the expected introns\ntest_introns <-\n    sod1_annotation %>%\n    dplyr::filter(type == \"exon\") %>%\n   "
  },
  {
    "path": "tests/testthat/test-geom_junction_label_repel.R",
    "chars": 2256,
    "preview": "# manually create the expected introns\ntest_introns <-\n    sod1_annotation %>%\n    dplyr::filter(\n        type == \"exon\""
  },
  {
    "path": "tests/testthat/test-geom_range.R",
    "chars": 1618,
    "preview": "# create dummy exons for testing\ntest_exons <-\n    dplyr::tibble(\n        start = c(100, 300, 500, 650),\n        end = s"
  },
  {
    "path": "tests/testthat/test-shorten_gaps.R",
    "chars": 8793,
    "preview": "test_exons <-\n    dplyr::tibble(\n        seqnames = \"1\",\n        start = c(100, 300, 500, 650),\n        end = start + 10"
  },
  {
    "path": "tests/testthat/test-to_diff.R",
    "chars": 2123,
    "preview": "sod1_exons <- sod1_annotation %>%\n    dplyr::filter(type == \"exon\")\n\nmane <- sod1_exons %>%\n    dplyr::filter(transcript"
  },
  {
    "path": "tests/testthat/test-to_intron.R",
    "chars": 1251,
    "preview": "# create dummy exons for testing\ntest_exons <-\n    dplyr::tibble(\n        start = c(100, 300, 500, 650),\n        end = s"
  },
  {
    "path": "tests/testthat/test-utils.R",
    "chars": 1557,
    "preview": "# create dummy exons for testing\ntest_exons <-\n    dplyr::tibble(\n        start = c(100, 300, 500, 650),\n        end = s"
  },
  {
    "path": "tests/testthat.R",
    "chars": 68,
    "preview": "library(testthat)\nlibrary(ggtranscript)\n\ntest_check(\"ggtranscript\")\n"
  },
  {
    "path": "vignettes/.gitignore",
    "chars": 11,
    "preview": "*.html\n*.R\n"
  },
  {
    "path": "vignettes/ggtranscript.Rmd",
    "chars": 17446,
    "preview": "---\ntitle: \"Getting started\"\nauthor: \n  - name: David Zhang\n    affiliation:\n    - UCL\n    email: dyzhang32@gmail.com\nou"
  }
]

// ... and 3 more files (download for full content)

About this extraction

This page contains the full source code of the dzhang32/ggtranscript GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 62 files (227.7 KB), approximately 64.8k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!