Repository: SteffenMoritz/imputeTS Branch: master Commit: bca3fcd312d5 Files: 180 Total size: 1.2 MB Directory structure: gitextract_ov74y_ad/ ├── .Rbuildignore ├── .github/ │ ├── .gitignore │ └── workflows/ │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE.txt ├── NAMESPACE ├── NEWS.md ├── R/ │ ├── .Rapp.history │ ├── RcppExports.R │ ├── deprecated_defunct.R │ ├── ggplot_na_distribution.R │ ├── ggplot_na_distribution2.R │ ├── ggplot_na_gapsize.R │ ├── ggplot_na_gapsize2.R │ ├── ggplot_na_imputations.R │ ├── imputeTS-package.R │ ├── internal_algorithm_interface.R │ ├── na_interpolation.R │ ├── na_kalman.R │ ├── na_locf.R │ ├── na_ma.R │ ├── na_mean.R │ ├── na_random.R │ ├── na_remove.R │ ├── na_replace.R │ ├── na_seadec.R │ ├── na_seasplit.R │ ├── statsNA.R │ ├── tsAirgap.R │ ├── tsAirgapComplete.R │ ├── tsHeating.R │ ├── tsHeatingComplete.R │ ├── tsNH4.R │ └── tsNH4Complete.R ├── README.md ├── _pkgdown.yaml ├── codecov.yml ├── data/ │ ├── tsAirgap.rda │ ├── tsAirgapComplete.rda │ ├── tsHeating.rda │ ├── tsHeatingComplete.rda │ ├── tsNH4.rda │ └── tsNH4Complete.rda ├── docs/ │ ├── 404.html │ ├── articles/ │ │ ├── gallery_visualizations.html │ │ ├── gallery_visualizations_files/ │ │ │ ├── accessible-code-block-0.0.1/ │ │ │ │ └── empty-anchor.js │ │ │ ├── header-attrs-2.16/ │ │ │ │ └── header-attrs.js │ │ │ └── header-attrs-2.7/ │ │ │ └── header-attrs.js │ │ └── index.html │ ├── authors.html │ ├── bootstrap-toc.css │ ├── bootstrap-toc.js │ ├── docsearch.css │ ├── docsearch.js │ ├── index.html │ ├── news/ │ │ └── index.html │ ├── pkgdown.css │ ├── pkgdown.js │ ├── pkgdown.yml │ ├── reference/ │ │ ├── figures/ │ │ │ └── Cheat_Sheet_imputeTS.pptx │ │ ├── ggplot_na_distribution.html │ │ ├── ggplot_na_distribution2.html │ │ ├── ggplot_na_gapsize.html │ │ ├── ggplot_na_gapsize2.html │ │ ├── ggplot_na_imputations.html │ │ ├── ggplot_na_intervals.html │ │ ├── ggplot_na_level.html │ │ ├── 
ggplot_na_level2.html │ │ ├── ggplot_na_pattern.html │ │ ├── imputeTS-package.html │ │ ├── imputeTS.html │ │ ├── index.html │ │ ├── na.interpolation.html │ │ ├── na.kalman.html │ │ ├── na.locf.html │ │ ├── na.ma.html │ │ ├── na.mean.html │ │ ├── na.random.html │ │ ├── na.remove.html │ │ ├── na.replace.html │ │ ├── na.seadec.html │ │ ├── na.seasplit.html │ │ ├── na_interpolation.html │ │ ├── na_kalman.html │ │ ├── na_locf.html │ │ ├── na_ma.html │ │ ├── na_mean.html │ │ ├── na_random.html │ │ ├── na_remove.html │ │ ├── na_replace.html │ │ ├── na_seadec.html │ │ ├── na_seasplit.html │ │ ├── plotNA.distribution.html │ │ ├── plotNA.distributionBar.html │ │ ├── plotNA.gapsize.html │ │ ├── plotNA.imputations.html │ │ ├── reexports.html │ │ ├── statsNA.html │ │ ├── tsAirgap.html │ │ ├── tsAirgapComplete.html │ │ ├── tsHeating.html │ │ ├── tsHeatingComplete.html │ │ ├── tsNH4.html │ │ └── tsNH4Complete.html │ └── sitemap.xml ├── imputeTS.Rproj ├── inst/ │ └── CITATION ├── man/ │ ├── ggplot_na_distribution.Rd │ ├── ggplot_na_distribution2.Rd │ ├── ggplot_na_gapsize.Rd │ ├── ggplot_na_gapsize2.Rd │ ├── ggplot_na_imputations.Rd │ ├── ggplot_na_intervals.Rd │ ├── imputeTS-package.Rd │ ├── na.interpolation.Rd │ ├── na.kalman.Rd │ ├── na.locf.Rd │ ├── na.ma.Rd │ ├── na.mean.Rd │ ├── na.random.Rd │ ├── na.remove.Rd │ ├── na.replace.Rd │ ├── na.seadec.Rd │ ├── na.seasplit.Rd │ ├── na_interpolation.Rd │ ├── na_kalman.Rd │ ├── na_locf.Rd │ ├── na_ma.Rd │ ├── na_mean.Rd │ ├── na_random.Rd │ ├── na_remove.Rd │ ├── na_replace.Rd │ ├── na_seadec.Rd │ ├── na_seasplit.Rd │ ├── plotNA.distribution.Rd │ ├── plotNA.distributionBar.Rd │ ├── plotNA.gapsize.Rd │ ├── plotNA.imputations.Rd │ ├── reexports.Rd │ ├── statsNA.Rd │ ├── tsAirgap.Rd │ ├── tsAirgapComplete.Rd │ ├── tsHeating.Rd │ ├── tsHeatingComplete.Rd │ ├── tsNH4.Rd │ └── tsNH4Complete.Rd ├── src/ │ ├── RcppExports.cpp │ ├── locf.cpp │ └── ma.cpp ├── tests/ │ ├── testthat/ │ │ ├── test-apply_base_algorithm.R │ │ ├── 
test-depreciated_defunct.R │ │ ├── test-error_handling.R │ │ ├── test-ggplot_na_distribution.R │ │ ├── test-ggplot_na_distribution2.R │ │ ├── test-ggplot_na_gapsize.R │ │ ├── test-ggplot_na_gapsize2.R │ │ ├── test-ggplot_na_imputations.R │ │ ├── test-input-na_advanced-tsObjects.R │ │ ├── test-na_interpolation.R │ │ ├── test-na_kalman.R │ │ ├── test-na_locf.R │ │ ├── test-na_ma.R │ │ ├── test-na_mean.R │ │ ├── test-na_random.R │ │ ├── test-na_remove.R │ │ ├── test-na_replace.R │ │ ├── test-na_seadec.R │ │ ├── test-na_seasplit.R │ │ ├── test-parameter-maxgap.R │ │ └── test-statsNA.R │ └── testthat.R └── vignettes/ ├── Cheat_Sheet_imputeTS.pdf.asis ├── Cheat_Sheet_imputeTS.pptx ├── RJournal.sty ├── gallery_visualizations.Rmd └── imputeTS-Time-Series-Missing-Value-Imputation-in-R.ltx ================================================ FILE CONTENTS ================================================ ================================================ FILE: .Rbuildignore ================================================ imputeTS-header.png imputeTS-header.jpg imputeTS-logo1800x2100.png ^.*\.Rproj$ ^\.Rproj\.user$ ^.*\.yml$ ^.*\.yaml$ ^.*\.ini$ ^.*\.txt$ ^.*\.pptx$ ^appveyor\.yml$ Icon Icon? Docs ^doc$ ^Meta$ ^revdep ^\.github$ ^codecov\.yml$ ================================================ FILE: .github/.gitignore ================================================ *.html ================================================ FILE: .github/workflows/R-CMD-check.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/master/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help # # NOTE: This workflow is overkill for most R packages and # check-standard.yaml is likely a better choice. # usethis::use_github_action("check-standard") will install it. 
on: push: branches: [main, master] pull_request: branches: [main, master] name: R-CMD-check jobs: R-CMD-check: runs-on: ${{ matrix.config.os }} name: ${{ matrix.config.os }} (${{ matrix.config.r }}) strategy: fail-fast: false matrix: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} # exercise older Windows toolchain via oldrel-4 (R 4.1.x, rtools40) - {os: windows-latest, r: 'oldrel-4'} - {os: ubuntu-latest, r: 'release'} - {os: ubuntu-latest, r: 'oldrel-1'} - {os: ubuntu-latest, r: 'oldrel-2'} - {os: ubuntu-latest, r: 'oldrel-3'} - {os: ubuntu-latest, r: 'oldrel-4'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes steps: - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 - uses: r-lib/actions/setup-tinytex@v2 - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} http-user-agent: ${{ matrix.config.http-user-agent }} use-public-rspm: true - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: rcmdcheck - name: Install dependencies run: | tinytex::parse_install(text = "! LaTeX Error: File `etex.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `tikz.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `fancyhdr.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `microtype.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `setspace.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `titlesec.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `placeins.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `caption.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `environ.sty' not found.") tinytex::parse_install(text = "! LaTeX Error: File `upquote.sty' not found.") tinytex::parse_install(text = "! Font OML/zplm/m/it/9=zplmr7m at 9.0pt not loadable: Metric (TFM) file not found.") tinytex::parse_install(text = "! 
Font T1/ppl/m/n/10=pplr8t at 10.0pt not loadable: Metric (TFM) file not found.") shell: Rscript {0} - uses: r-lib/actions/check-r-package@v2 - name: Show testthat output if: always() run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true shell: bash - name: Upload check results if: failure() uses: actions/upload-artifact@main with: name: ${{ runner.os }}-r${{ matrix.config.r }}-results path: check ================================================ FILE: .github/workflows/pkgdown.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/master/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] release: types: [published] workflow_dispatch: name: pkgdown jobs: pkgdown: runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: pkgdown needs: website - name: Install the package run: R CMD INSTALL . - name: Deploy package run: | git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' ================================================ FILE: .github/workflows/pr-commands.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/master/examples # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: issue_comment: types: [created] name: Commands jobs: document: if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} name: document runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v4 - uses: r-lib/actions/pr-fetch@v2 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: roxygen2 - name: Document run: Rscript -e 'roxygen2::roxygenise()' - name: commit run: | git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" git add man/\* NAMESPACE git commit -m 'Document' - uses: r-lib/actions/pr-push@v2 with: repo-token: ${{ secrets.GITHUB_TOKEN }} style: if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} name: style runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v4 - uses: r-lib/actions/pr-fetch@v2 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: r-lib/actions/setup-r@v2 - name: Install dependencies run: Rscript -e 'install.packages("styler")' - name: Style run: Rscript -e 'styler::style_pkg()' - name: commit run: | git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" git add \*.R git commit -m 'Style' - uses: r-lib/actions/pr-push@v2 with: repo-token: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/test-coverage.yaml ================================================ # Workflow derived from
https://github.com/r-lib/actions/tree/master/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] name: test-coverage jobs: test-coverage: runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: covr - name: Test coverage run: covr::codecov() shell: Rscript {0} ================================================ FILE: .gitignore ================================================ .Rproj.user /.Rhistory /.dropbox /desktop.ini /.RData Icon? Icon *.o *.dll *.so doc Meta /doc/ /Meta/ ================================================ FILE: DESCRIPTION ================================================ Package: imputeTS Version: 3.4 Date: 2025-08-25 Title: Time Series Missing Value Imputation Description: Imputation (replacement) of missing values in univariate time series. Offers several imputation functions and missing data plots. Available imputation algorithms include: 'Mean', 'LOCF', 'Interpolation', 'Moving Average', 'Seasonal Decomposition', 'Kalman Smoothing on Structural Time Series models', 'Kalman Smoothing on ARIMA models'. Published in Moritz and Bartz-Beielstein (2017) <doi:10.32614/RJ-2017-009>.
Author: Steffen Moritz [aut, cre, cph] (<https://orcid.org/0000-0002-0085-1804>), Sebastian Gatscha [aut], Earo Wang [ctb] (<https://orcid.org/0000-0001-6448-5260>), Ron Hause [ctb] (<https://orcid.org/0000-0002-5229-7366>) Authors@R: c( person("Steffen", "Moritz", email="steffen.moritz10@gmail.com", role=c("aut", "cre", "cph"), comment = c(ORCID = "0000-0002-0085-1804")), person("Sebastian", "Gatscha", email="sebastian_gatscha@gmx.at", role="aut"), person("Earo", "Wang", email = "earo.wang@gmail.com", role = c("ctb"), comment = c(ORCID = "0000-0001-6448-5260")), person("Ron", "Hause", email = "ronaldhause@gmail.com", role = c("ctb"), comment = c(ORCID = "0000-0002-5229-7366")) ) Maintainer: Steffen Moritz <steffen.moritz10@gmail.com> LazyData: yes Type: Package ByteCompile: TRUE BugReports: https://github.com/SteffenMoritz/imputeTS/issues URL: https://github.com/SteffenMoritz/imputeTS, https://steffenmoritz.github.io/imputeTS/ Repository: CRAN Depends: R (>= 3.6) Imports: stats, grDevices, ggplot2 (>= 3.3.0), ggtext, stinepack, forecast, magrittr, methods, Rcpp Suggests: testthat, R.rsp, knitr, zoo, timeSeries, tis, xts, tibble, tsibble, rmarkdown, covr License: GPL-3 VignetteBuilder: R.rsp, knitr, rmarkdown RoxygenNote: 7.3.2 Roxygen: list(markdown = TRUE) LinkingTo: Rcpp Encoding: UTF-8 ================================================ FILE: LICENSE.txt ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users.
We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. 
The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. 
Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 
The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. 
Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. 
You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. 
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. 
Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 
If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 
When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. 
If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. 
If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. 
For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. 
You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
{one line to give the program's name and a brief idea of what it does.} Copyright (C) {year} {name of author} This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: {project} Copyright (C) {year} {fullname} This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see https://www.gnu.org/licenses/. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read https://www.gnu.org/philosophy/why-not-lgpl.html.
================================================ FILE: NAMESPACE ================================================ # Generated by roxygen2: do not edit by hand export("%>%") export(ggplot_na_distribution) export(ggplot_na_distribution2) export(ggplot_na_gapsize) export(ggplot_na_gapsize2) export(ggplot_na_imputations) export(ggplot_na_intervals) export(na.interpolation) export(na.kalman) export(na.locf) export(na.ma) export(na.mean) export(na.random) export(na.remove) export(na.replace) export(na.seadec) export(na.seasplit) export(na_interpolation) export(na_kalman) export(na_locf) export(na_ma) export(na_mean) export(na_random) export(na_remove) export(na_replace) export(na_seadec) export(na_seasplit) export(plotNA.distribution) export(plotNA.distributionBar) export(plotNA.gapsize) export(plotNA.imputations) export(statsNA) import(stats) importFrom(Rcpp,sourceCpp) importFrom(forecast,auto.arima) importFrom(forecast,findfrequency) importFrom(ggplot2,aes) importFrom(ggplot2,after_stat) importFrom(ggplot2,alpha) importFrom(ggplot2,coord_flip) importFrom(ggplot2,element_blank) importFrom(ggplot2,element_text) importFrom(ggplot2,geom_bar) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_text) importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) importFrom(ggplot2,guide_legend) importFrom(ggplot2,guides) importFrom(ggplot2,labs) importFrom(ggplot2,position_dodge) importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_gradientn) importFrom(ggplot2,scale_fill_manual) importFrom(ggplot2,scale_size_identity) importFrom(ggplot2,scale_x_continuous) importFrom(ggplot2,scale_x_discrete) importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,stat_bin) importFrom(ggplot2,theme) importFrom(ggplot2,theme_classic) importFrom(ggplot2,theme_linedraw) importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) importFrom(ggtext,element_markdown) importFrom(grDevices,heat.colors) importFrom(grDevices,nclass.Sturges) importFrom(magrittr,"%>%") 
importFrom(methods,hasArg) importFrom(stats,KalmanRun) importFrom(stats,KalmanSmooth) importFrom(stats,StructTS) importFrom(stats,approx) importFrom(stats,arima) importFrom(stats,frequency) importFrom(stats,median) importFrom(stats,runif) importFrom(stats,spline) importFrom(stats,stl) importFrom(stats,ts) importFrom(stinepack,stinterp) importFrom(utils,globalVariables) useDynLib(imputeTS) ================================================ FILE: NEWS.md ================================================ # Changes in Version 3.4 * Added ggplot_na_gapsize2 plot (and unit tests). Nice way to illustrate how different NA gapsizes (consecutive NAs in a row) account for NA totals. * Fix of the CITATION file to comply with newer CRAN rules * Update of ggplot_na_imputations() to avoid using now-deprecated ggplot2 options * Update of unit tests for all plotting functions (ggplot_na_...). Now using is_ggplot() to check for correct output. This was necessary because of a major ggplot2 update (switch to S7 classes). # Changes in Version 3.3 Thanks to Sabrina Krys, Kevin Villalobos, Tracy Shen, hezhichao1991, englianhu for bug / issue reporting. Thanks to RicardaP for fixing documentation error. Thanks to Ronald Hause for the commit to optimize parameter pass through from approx to na_interpolation. * Renamed ggplot_na_intervals to ggplot_na_distribution2 * Updates to ggplot_na_gapsize: Space between the bars adjusted for better optics. Added parameters for directly choosing the bar border color and alpha value for filling of the bars. * Improved notification message for na_seadec/na_seasplit when find_frequency couldn't find a seasonal pattern. * Corrected error in na_kalman documentation - auto.arima was wrongly described as default parameter choice, while in reality it is StructTS (reported by RicardaP) * Changes for the error handling. (**These changes got reverted and did not make it into the CRAN release**).
For some specific cases the input checks performed by imputeTS stop pipe workflows in their entirety. E.g. a problem when group_by leads to all NA subsets - which fail the input check and then stop the whole pipe workflow. To prevent this, stop() is only called, when the user supplied imputeTS algorithm parameter options are wrong or misspelled. Unsupported input data will only give a warning() (and do not perform any action on the data). Thus, there is no call to stop(), that cancels the whole pipe workflow. (issue reported by Sabrina Krys). This works fine, but after closer consideration we figured people fail to notice warnings way too often and thus it is more user friendly to clearly stop with an error for these issues. After all, the user's data analysis clearly profits from taking a closer look in these specific cases. If you are nevertheless interested in the version without the reverted changes, it can be installed from github with the following command: devtools::install_github("https://github.com/SteffenMoritz/imputeTS/commit/aaf759216b4091e36dee6e8e3a10185ff8f4647b") * Improved error messages (especially for multivariate inputs) and unit tests for the warnings and errors. * Corrected typo in 'Input data needs at least x non-NA data points' error message * Better parameter pass through from approx to na_interpolation - Added capability to alter rule for linear extrapolation outside the interval [min(x), max(x)] (commit by Ronald Hause) * Improved na_interpolation documentation (more information about possible parameter pass through from underlying spline, approx, stinterp functions) * Additional unit tests * Moved to Github Actions instead of TravisCI / AppVeyor. * Bugfix for "Error in optim(init[mask], getLike, method = "L-BFGS-B", lower = rep(0, : L-BFGS-B needs finite values of 'fn'.", which comes for completely constant input to na_kalman e.g. 4,4,4,NA,4,4.
(reported by Kevin Villalobos, Tracy Shen, hezhichao1991, englianhu) * Improved na_seadec documentation (algorithm details) * Changed R Version requirement in Description to R (>= 3.6) since imported packages like ggtext and also some testthat tests were already requiring newer versions than the old R (≥ 3.0.1) requirement of imputeTS # Changes in Version 3.2 Thanks to Mark J. Lamias for bug / issue reporting. Thanks to Cyrus Mohammadian for bug reporting. Thanks to Miroslaw Janik for issue reporting. * Fix to remove CRAN note - removed not used utils from DESCRIPTION imports * Minor fix to ggplot_na_distribution (bars end now at max(timeseries)*1.05) * Typo corrections in statsNA * Specified ggplot2 (>= 3.3.0) in imports, to prevent errors with older ggplot2 versions (reported by Cyrus Mohammadian) * Updated na_locf documentation to make behavior of na_remaining parameter more clear (issue reported by Mark J. Lamias) * ggplot_na_intervals, has now percentages with % sign (e.g. 10%) on y-scale instead of just numbers (e.g. 0,1) (suggestion from Miroslaw Janik) * Added some figures and the Cheat Sheet .pptx to .Rbuildignore to avoid CRAN warning about package size. These files and figures were not needed for the CRAN version. # Changes in Version 3.1 Thanks to Johannes Menzel for bug reporting, Thanks to Jan (jmablans) for bug reporting. Thanks to Earo Wang for speedup of plotNA.gapsize. Special Thanks to Sebastian Gatscha for plotting functions, new na_mean options, new unit tests. * Plotting functions are all in ggplot now (way better looking). Additionally they got renamed accordingly ggplot_na_distribution, ggplot_na_intervals, ggplot_na_gapsize, ggplot_na_imputations. 
* Speedup for plotNA_gapsize calculation (now renamed ggplot_na_gapsize) (thx to Earo Wang) * Added harmonic and geometric mean as option for na_mean * Removed bug in na_replace - it can now be used with all NA vectors since it requires no minimum of non-NA values (reported by Jan - jmablans) * Improved na.random input check (usable with all NA input now if upper and lower bound parameters are explicitly set to numeric values) * Additional unit tests for the plotting functions * Additional unit tests for all imputation functions (testing all NA input) * Update for testthat unit tests * Fixed a mistake in README.md (reported by Johannes Menzel) * Added to statsNA: Number of Gaps, Average Gap Size + reformatting of code + compatibility with other ts objects * Documentation improvements through newer roxygen version (Markup now possible in documentation) * updated Readme + Vignette to new function names * Added the imputeTS Cheat Sheet as Vignette * Added new vignette Gallery Missing Data Visualizations * Added revdep # Changes in Version 3.0 Thanks to Jim Maas, shreydesai, Breza, CameronNemo for reporting bugs. Thanks to Sebastian Gatscha providing the (way faster) C++ na.ma() implementation. * tibble and tstibble compatibility * Reworked internal code documentation * na.ma speed up via C++ * Changed vignette builder to R.rsp * Used R package styler package to optimize source code readability * Made some changes to better follow tidyverse style guide * Replaced na. with na_ e.g. na.mean with na_mean etc. This fits better to modern code style guidelines. The old function names will still work for a while, but give a warning.
* Added findFrequency option to na.seadec and na.seasplit * Added maxgap option * Fixed bug for na.seadec - also imputed known values in some special cases (reported by CameronNemo) * Added doi: 10.32614/RJ-2017-009 to description, references, readme and citation file * Added StackExchange link to Readme * Moved stinepack from imports to suggested * Internal reorganization of imports - now always using pkg::function and importFrom pkg x1 x2 x3 instead of just import pkg * Fixed bug in na.ma when using xts time series with NA at the end * Fixed error message in na.interpolation if wrong parameter is given stop("Wrong parameter 'option' given. Value must be either 'linear', 'spline' or 'stine'.") (reported by Breza) * Fixed spelling mistakes in na.seadec and na.seasplit (reported by shreydesai) * Fixed bug with na.random() output (reported by Jim Maas) # Changes in Version 2.7 * Updated Description: Orcid Id added, packages required for unit tests added as "Suggested" * Small correction in README.md, small update to citation file * Replaced NEWS with NEWS.md for better formatting # Changes in Version 2.6 * Updated citation file * Minor changes to vignette # Changes in Version 2.5 * Adjusted unit test to an update of forecast package # Changes in Version 2.4 * Small speed improvements for na.kalman * Improved input check for all functions * Bugfix for unit tests * Changes to unit test (because of zoo update) # Changes in Version 2.3 * Bugfix for na.kalman with integer input * Readme Update * Improved error messages for na.seasplit and na.seadec * Minor vignette changes # Changes in Version 2.2 * Bugfix for na.locf (also concerned na.kalman) # Changes in Version 2.1 * Fixed for problems with Solaris/Sparc * Fixes for problems with vignette on osx # Changes in Version 2.0 * Bugfix for plots without missing data * Increased performance for na.locf * Minor bugfixes for specific data.frame inputs * Minor bugfixes for specific xts object inputs * Improved Code Documentation *
Added new software tests # Changes in Version 1.9 * Added Vignette # Changes in Version 1.8 * Computation time improvements for na.locf (up to 10000 times faster) * Computation time improvements for na.interpolation (up to 10000 times faster) * Computation time improvements for na.kalman (only slightly faster, under 10%) * Fixed unnecessary warning message with some na.kalman options * Adjusted default parameters for plotNA.distributionBar (using nclass.Sturges for breaks parameter) * Fixed issue with too sensitive input checking # Changes in Version 1.7 * Enabled usage of multivariate input (data.frame, mts, matrix,...) for all imputation functions except na.remove. This means users do not have to loop through all columns by themselves anymore if they want to use the package with multivariate data. The imputation itself is still performed in a univariate manner (column after column). * Improved compatibility with different advanced time series objects like zoo and xts. Using the imputation functions with these time series objects should be possible now. These series will not be explicitly named as possible input in the user documentation. Absence of errors can not be guaranteed. However, there are no known issues yet.
* Added several things for unit tests with pkg 'testthat' * Added unit tests for every function * Adjusted error messages * Internal Coding style improvement: replaced all T with TRUE and all F with FALSE * Adjustment tsHeating / tsHeatingComplete datasets (set 1440 as frequency parameter) * Adjustment tsNH4 / tsNH4Complete datasets (set 144 as frequency parameter) * Fixes for grammar, spelling and citations in the whole documentation * Revised examples in the documentation for all functions * Restricted output of na.remove to vector only (issue with incorrect time information otherwise) * Added better x-axes labels for plotNA.distribution # Changes in Version 1.6 * Added github links to description file * Added citation file * Updated Readme (badges for travis ci and cran status) * Fix in documentation for na.interpolation (due to outdated descriptions) * Fix in documentation plotNA.distribution / plotNA.distributionBar (due to interchanged descriptions) * Added references to used packages in na.kalman and na.interpolation documentation # Changes in Version 1.5 * Allows now also numeric vectors as input * Removed na.identifier parameter for all functions (too error prone, better handled individually by the user) * Minor changes in na.interpolation with option = "stine" * Added na.ma imputation function * Replaced "data" in all function parameters with the more common "x" * Improvement of all code examples * Renamed heating/heatingComplete dataset to tsHeating/tsHeatingComplete * Renamed nh4/nh4Complete dataset to tsNH4/tsNH4Complete * Added tsAirgap / tsAirgapComplete datasets * Improved imputeTS-package documentation * Added na.kalman imputation function * Added README.md function * Added statsNA function * Added plotNA.gapsize function * Renamed vis.imputations to plotNA.imputations * Renamed vis.barMissing to plotNA.distributionBar * Renamed vis.missing to plotNA.distribution * Fixed issues with parameter pass through and legend for all plotting functions * 
Improved dataset documentation # Changes in Version 0.4 * Update of vis.differences (better looking plot now) * Added vis.missing to visualize the distribution of missing data in a time series * Added vis.barMissing, which is especially suited to visualize missing data in very huge time series * Update na.interpolate (added Stineman interpolation and enabled ... parameter for all interpolation algorithms to pass through parameters to the underlying functions) # Changes in Version 0.3 * Added two datasets of sensor data * vis.differences for plotting differences between real and imputed values # Changes in Version 0.2 * Removed internal functions from visible package documentation * Added additional algorithms: na.seasplit and na.seadec * internal function for algorithm selection # Changes in Version 0.1 * Created initial version of imputeTS package for univariate time series imputation * added the simple imputation functions: na.locf, na.mean, na.random, na.interpolation, na.replace * added na.remove function for removing all NAs from a time series ================================================ FILE: R/.Rapp.history ================================================ ================================================ FILE: R/RcppExports.R ================================================ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 locf <- function(x, reverse) { .Call('_imputeTS_locf', PACKAGE = 'imputeTS', x, reverse) } ma <- function(x, k, weighting) { .Call('_imputeTS_ma', PACKAGE = 'imputeTS', x, k, weighting) } ================================================ FILE: R/deprecated_defunct.R ================================================ #--------------------------------------------------------------# # Collection of DEPRECATED AND DEFUNCT FUNCTIONS #--------------------------------------------------------------# #--------------------------------------------------------------# # 
# IMPUTATION FUNCTIONS
# Old na. imputation functions, replaced by na_
# Deprecated since Version 3.0 (2019-07-01)
#
# Every wrapper in this section follows the same pattern: it emits a
# deprecation warning via .Deprecated() and then forwards its arguments
# (positionally, followed by ...) to the na_ function of the same name.
#--------------------------------------------------------------#


# na.interpolation()
# replaced by na_interpolation
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_interpolation}} instead.
#' @description na.interpolation is replaced by \code{\link[imputeTS]{na_interpolation}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_interpolation
#' @keywords internal
#' @export
na.interpolation <- function(x, option = "linear", maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_interpolation",
    old = "na.interpolation",
    msg = "na.interpolation will be replaced by na_interpolation. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_interpolation(x, option, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.kalman()
# replaced by na_kalman
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_kalman}} instead.
#' @description na.kalman is replaced by \code{\link[imputeTS]{na_kalman}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_kalman
#' @keywords internal
#' @export
na.kalman <- function(x, model = "StructTS", smooth = TRUE, nit = -1, maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_kalman",
    old = "na.kalman",
    msg = "na.kalman will be replaced by na_kalman. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_kalman(x, model, smooth, nit, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.locf()
# replaced by na_locf
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_locf}} instead.
#' @description na.locf is replaced by \code{\link[imputeTS]{na_locf}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_locf
#' @keywords internal
#' @export
na.locf <- function(x, option = "locf", na.remaining = "rev", maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  # na.remaining is handed over by position (third argument of na_locf).
  .Deprecated(
    new = "na_locf",
    old = "na.locf",
    msg = "na.locf will be replaced by na_locf. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_locf(x, option, na.remaining, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.ma()
# replaced by na_ma
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_ma}} instead.
#' @description na.ma is replaced by \code{\link[imputeTS]{na_ma}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_ma
#' @keywords internal
#' @export
na.ma <- function(x, k = 4, weighting = "exponential", maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_ma",
    old = "na.ma",
    msg = "na.ma will be replaced by na_ma. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_ma(x, k, weighting, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.mean()
# replaced by na_mean
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_mean}} instead.
#' @description na.mean is replaced by \code{\link[imputeTS]{na_mean}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_mean
#' @keywords internal
#' @export
na.mean <- function(x, option = "mean", maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_mean",
    old = "na.mean",
    msg = "na.mean will be replaced by na_mean. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_mean(x, option, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.random()
# replaced by na_random
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_random}} instead.
#' @description na.random is replaced by \code{\link[imputeTS]{na_random}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_random
#' @keywords internal
#' @export
na.random <- function(x, lower_bound = NULL, upper_bound = NULL, maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_random",
    old = "na.random",
    msg = "na.random will be replaced by na_random. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_random(x, lower_bound, upper_bound, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.remove()
# replaced by na_remove
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_remove}} instead.
#' @description na.remove is replaced by \code{\link[imputeTS]{na_remove}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_remove
#' @keywords internal
#' @export
na.remove <- function(x, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_remove",
    old = "na.remove",
    msg = "na.remove will be replaced by na_remove. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_remove(x, ...)
}
#--------------------------------------------------------------------------------------#


# na.replace()
# replaced by na_replace
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_replace}} instead.
#' @description na.replace is replaced by \code{\link[imputeTS]{na_replace}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_replace
#' @keywords internal
#' @export
na.replace <- function(x, fill = 0, maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_replace",
    old = "na.replace",
    msg = "na.replace will be replaced by na_replace. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_replace(x, fill, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.seadec()
# replaced by na_seadec
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_seadec}} instead.
#' @description na.seadec is replaced by \code{\link[imputeTS]{na_seadec}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_seadec
#' @keywords internal
#' @export
na.seadec <- function(x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_seadec",
    old = "na.seadec",
    msg = "na.seadec will be replaced by na_seadec. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_seadec(x, algorithm, find_frequency, maxgap, ...)
}
#--------------------------------------------------------------------------------------#


# na.seasplit()
# replaced by na_seasplit
#--------------------------------------------------------------------------------------#
#' Deprecated use \code{\link[imputeTS]{na_seasplit}} instead.
#' @description na.seasplit is replaced by \code{\link[imputeTS]{na_seasplit}}.
#' The functionality stays the same. The new name better fits modern R code
#' style guidelines (which prefer _ over . in function names).
#' @inheritParams na_seasplit
#' @keywords internal
#' @export
na.seasplit <- function(x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ...) {
  # Warn the caller, then delegate to the replacement function.
  .Deprecated(
    new = "na_seasplit",
    old = "na.seasplit",
    msg = "na.seasplit will be replaced by na_seasplit. Functionality stays the same. The new function name better fits modern R code style guidelines. Please adjust your code accordingly."
  )
  na_seasplit(x, algorithm, find_frequency, maxgap, ...)
}
#--------------------------------------------------------------------------------------#



#--------------------------------------------------------------#
# PLOTTING FUNCTIONS
# Old plotNA. visualization functions, replaced by ggplot_na_
# Deprecated since Version 3.1 (2020-07-30)
#
# The functions in this section are defunct stubs: each body consists
# solely of a .Defunct() call that points to the replacement function.
# Their arguments (x, ...) are accepted but never used.
#--------------------------------------------------------------#


# plotNA.distribution()
# replaced by ggplot_na_distribution
#--------------------------------------------------------------------------------------#
#' @title Discontinued - Use \code{\link[imputeTS]{ggplot_na_distribution}} instead.
#' @description plotNA.distribution was replaced by \code{\link[imputeTS]{ggplot_na_distribution}}.
#' The new plotting function provides an improved version of the old plot, e.g. it looks better now and is better adjustable,
#' because it is based on ggplot2. If you absolutely want to use the old function,
#' you need to download an older package version. Versions 3.0 and below still have the old functions.
#' @keywords internal
#' @export
plotNA.distribution <- function(x, ... ) {
  .Defunct(
    new = "ggplot_na_distribution",
    msg = " plotNA.distribution was replaced by ggplot_na_distribution. Use this function instead. The plot itself is the same, but looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to manually download an older package version. (Versions 3.0 and below still have the old functions)"
  )
}
#--------------------------------------------------------------------------------------#


# ggplot_na_intervals()
# renamed to ggplot_na_distribution2
#--------------------------------------------------------------------------------------#
#' @title Discontinued - Use \code{\link[imputeTS]{ggplot_na_distribution2}} instead.
#' @description ggplot_na_intervals was renamed to \code{\link[imputeTS]{ggplot_na_distribution2}}.
#' Use the new function name instead. The functionality stays the same.
#' @keywords internal
#' @export
ggplot_na_intervals <- function(x, ... ) {
  .Defunct(
    new = "ggplot_na_distribution2",
    msg = " ggplot_na_intervals was renamed to ggplot_na_distribution2. Use this function instead. Functionality stays the same. "
  )
}
#--------------------------------------------------------------------------------------#


# plotNA.distributionBar()
# replaced by ggplot_na_distribution2
#--------------------------------------------------------------------------------------#
#' @title Discontinued - Use \code{\link[imputeTS]{ggplot_na_distribution2}} instead.
#' @description plotNA.distributionBar was replaced by \code{\link[imputeTS]{ggplot_na_distribution2}}.
#' The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable,
#' because it is based on ggplot2. If you absolutely want to use the old function,
#' you need to download an older package version. Versions 3.0 and below still have the old functions.
#' @keywords internal
#' @export
plotNA.distributionBar <- function(x, ... ) {
  .Defunct(
    new = "ggplot_na_distribution2",
    msg = " plotNA.distributionBar was replaced by ggplot_na_distribution2. Use this function instead. The plot itself is the same, but looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to manually download an older package version.
(Versions 3.0 and below still have the old functions)"
  )
}
#--------------------------------------------------------------------------------------#


# plotNA.gapsize()
# replaced by ggplot_na_gapsize
#--------------------------------------------------------------------------------------#
#' @title Discontinued - Use \code{\link[imputeTS]{ggplot_na_gapsize}} instead.
#' @description plotNA.gapsize was replaced by \code{\link[imputeTS]{ggplot_na_gapsize}}.
#' The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable,
#' because it is based on ggplot2. If you absolutely want to use the old function,
#' you need to download an older package version. Versions 3.0 and below still have the old functions.
#' @keywords internal
#' @export
plotNA.gapsize <- function(x, ... ) {
  .Defunct(
    new = "ggplot_na_gapsize",
    msg = " plotNA.gapsize was replaced by ggplot_na_gapsize. Use this function instead. The plot itself is the same, but looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to manually download an older package version. (Versions 3.0 and below still have the old functions)"
  )
}
#--------------------------------------------------------------------------------------#


# plotNA.imputations()
# replaced by ggplot_na_imputations
#--------------------------------------------------------------------------------------#
#' @title Discontinued - Use \code{\link[imputeTS]{ggplot_na_imputations}} instead.
#' @description plotNA.imputations was replaced by \code{\link[imputeTS]{ggplot_na_imputations}}.
#' The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable,
#' because it is based on ggplot2. If you absolutely want to use the old function,
#' you need to download an older package version. Versions 3.0 and below still have the old functions.
#' @keywords internal
#' @export
plotNA.imputations <- function(x, ... ) {
  .Defunct(
    new = "ggplot_na_imputations",
    msg = " plotNA.imputations was replaced by ggplot_na_imputations. Use this function instead. The plot itself is the same, but looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to manually download an older package version. (Versions 3.0 and below still have the old functions)"
  )
}
#--------------------------------------------------------------------------------------#

================================================
FILE: R/ggplot_na_distribution.R
================================================
#' @title Line Plot to Visualize the Distribution of Missing Values
#'
#' @description Visualize the distribution of missing values within a time series.
#'
#' @param x Numeric Vector (\code{\link[base]{vector}}) or Time Series
#' (\code{\link[stats]{ts}}) object containing NAs. This is the only mandatory
#' parameter - all other parameters are only needed for adjusting the plot appearance.
#'
#' @param x_axis_labels For adding specific x-axis labels. Takes a vector of
#' \code{\link[base]{Date}} or \code{\link[base]{POSIXct}} objects
#' as an input (needs the same length as x, otherwise an error is raised).
#' The Default (NULL) uses the observation numbers as x-axis tick labels.
#'
#' @param color_points Color for the Symbols/Points.
#'
#' @param color_lines Color for the Lines.
#'
#' @param color_missing Color used for highlighting the time spans with NA values.
#'
#' @param color_missing_border Color used as border for time spans with NA values.
#'
#' @param alpha_missing Alpha (transparency) value used for color_missing.
#'
#' @param title Title of the Plot (NULL for deactivating title).
#'
#' @param subtitle Subtitle of the Plot (NULL for deactivating subtitle).
#'
#' @param xlab Label for x-Axis.
#'
#' @param ylab Label for y-Axis.
#'
#' @param shape_points Symbol to use for the Observations/Points. See
#' https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference.
#'
#' @param size_points Size of Symbols/Points.
#'
#' @param theme Set a Theme for ggplot2. Default is ggplot2::theme_linedraw().
#' (\code{\link[ggplot2]{theme_linedraw}})
#'
#' @return The output is a \code{\link[ggplot2]{ggplot2}} object that can be
#' further adjusted by using the ggplot syntax
#'
#' @details This function visualizes the distribution of missing values within
#' a time series. If a value is NA, the background is colored differently.
#' This gives a good overview of where most missing values occur.
#'
#' The only really needed parameter for this function is x (the univariate
#' time series that shall be visualized). All other parameters are solely
#' for altering the appearance of the plot.
#'
#' As long as the input is univariate and numeric the function also takes
#' data.frame, tibble, tsibble, zoo, xts as an input.
#'
#' The plot can be adjusted to your needs via the function parameters.
#' Additionally, for more complex adjustments, the output can also be
#' adjusted via ggplot2 syntax. This is possible, since the output
#' of the function is a ggplot2 object. Also take a look at the Examples
#' to see how adjustments are made.
#'
#' For very long time series it might happen, that the plot gets too crowded
#' and overplotting issues occur. In this case the
#' \code{\link[imputeTS]{ggplot_na_distribution2}} plotting function can provide
#' a more condensed overview.
#'
#'
#' @author Steffen Moritz, Sebastian Gatscha
#'
#' @seealso \code{\link[imputeTS]{ggplot_na_distribution2}},
#' \code{\link[imputeTS]{ggplot_na_gapsize}},
#' \code{\link[imputeTS]{ggplot_na_gapsize2}},
#' \code{\link[imputeTS]{ggplot_na_imputations}}
#'
#' @examples
#' # Example 1: Visualize the missing values in x
#' x <- stats::ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7))
#' ggplot_na_distribution(x)
#'
#' # Example 2: Visualize the missing values in tsAirgap time series
#' ggplot_na_distribution(tsAirgap)
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' x <- ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7))
#' x %>% ggplot_na_distribution()
#'
#' # Example 4: Visualize NAs in tsAirgap - different color for points
#' # Plot adjustments via ggplot_na_distribution function parameters
#' ggplot_na_distribution(tsAirgap, color_points = "grey")
#'
#' # Example 5: Visualize NAs in tsAirgap - different theme
#' # Plot adjustments via ggplot_na_distribution function parameters
#' ggplot_na_distribution(tsAirgap, theme = ggplot2::theme_classic())
#'
#' # Example 6: Visualize NAs in tsAirgap - title, subtitle in center
#' # Plot adjustments via ggplot2 syntax
#' ggplot_na_distribution(tsAirgap) +
#'   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
#'   ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
#'
#' # Example 7: Visualize NAs in tsAirgap - title in center, no subtitle
#' # Plot adjustments via ggplot2 syntax and function parameters
#' ggplot_na_distribution(tsAirgap, subtitle = NULL) +
#'   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
#'
#' # Example 8: Visualize NAs in tsAirgap - x-axis texts with angle
#' # Plot adjustments via ggplot2 syntax and function parameters
#' ggplot_na_distribution(tsAirgap, color_points = "grey") +
#'   ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
#'
#' @importFrom ggplot2 theme_linedraw ggplot geom_point aes geom_line geom_bar ggtitle
#' xlab ylab theme element_text theme_classic
#'
#' @importFrom stats ts
#'
#' @importFrom magrittr %>%
#'
#' @export
ggplot_na_distribution <- function(x,
                                   x_axis_labels = NULL,
                                   color_points = "steelblue",
                                   color_lines = "steelblue2",
                                   color_missing = "indianred",
                                   color_missing_border = "indianred",
                                   alpha_missing = 0.5,
                                   title = "Distribution of Missing Values",
                                   subtitle = "Time Series with highlighted missing regions",
                                   xlab = "Time",
                                   ylab = "Value",
                                   shape_points = 20,
                                   size_points = 2.5,
                                   theme = ggplot2::theme_linedraw()) {
  data <- x

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 special handling data types
  # tsibble: the second column is taken as the value column,
  # other tibbles: the first column is taken
  if (inherits(data, "tbl_ts")) {
    data <- as.vector(as.data.frame(data)[, 2])
  } else if (inherits(data, "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.2 Check if the input is multivariate
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    stop("x is not univariate. The function only works with univariate input for x. For data types with multiple variables/columns only input the column you want to plot as parameter x.")
  }

  # 1.3 Checks and corrections for wrong data dimension
  # Altering multivariate objects with 1 column (which are essentially
  # univariate) to be dim = NULL
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.4 Input as vector
  data <- as.vector(data)

  # 1.5 Check if input is numeric
  if (!is.numeric(data)) {
    stop("Input x is not numeric")
  }

  # 1.6 Check preconditions about amount of NAs
  # exclude NA only inputs
  missindx <- is.na(data)
  if (all(missindx)) {
    stop("Input data consists only of NAs. At least one non-NA numeric value is needed for creating a meaningful ggplot_na_distribution plot)")
  }

  # 1.7 Check that custom x-axis labels match the (transformed) input.
  # Without this check a mismatch only surfaces later as a cryptic
  # data.frame() error - or is silently recycled if the lengths divide.
  is_date_labels <- inherits(x_axis_labels, "Date")
  is_posix_labels <- inherits(x_axis_labels, "POSIXct")
  if ((is_date_labels || is_posix_labels) &&
    length(x_axis_labels) != length(data)) {
    stop("Input for x_axis_labels needs to have the same length as x")
  }

  ##
  ## End Input Check and Transformation
  ##


  ##
  ## 2. Preparations
  ##

  # 2.1 Create required data

  # Get NA positions
  id_na <- which(missindx)

  # 2.2 Create dataframe for ggplot2

  # Define x-axis label data
  # if Date or POSIXct given for x_axis_labels time information can be plotted
  # The bar width is 90% of the distance between the first two time points
  if (is_date_labels) {
    time <- x_axis_labels
    width_na_bar <- as.numeric(time[2] - time[1]) * 0.9
  } else if (is_posix_labels) {
    time <- x_axis_labels
    width_na_bar <- as.numeric(difftime(time[2], time[1], units = "secs")) * 0.9
  } else if (is.null(x_axis_labels)) {
    time <- seq_along(data)
    width_na_bar <- as.numeric(time[2] - time[1]) * 0.9
  } else {
    stop("Input for x_axis_labels is not in a supported format, must be a vector of Date or POSIXct objects with the same length as x")
  }

  # Create the remainder of the data.frame for ggplot2
  value <- data
  df <- data.frame(time, value)

  ##
  ## End Preparations
  ##


  ##
  ## 3. Create the ggplot2 plot
  ##

  # Create the plot
  gg <- ggplot2::ggplot() +

    # Adding the Line + Parameters
    ggplot2::geom_line(
      data = df, na.rm = TRUE,
      ggplot2::aes(x = time, y = value),
      col = color_lines
    ) +

    # Adding the Points + Parameters
    ggplot2::geom_point(
      data = df, na.rm = TRUE,
      ggplot2::aes(x = time, y = value),
      shape = shape_points, col = color_points, size = size_points
    ) +

    # Adding additional modifications like title, subtitle, theme,...
    ggplot2::ggtitle(label = title, subtitle = subtitle) +
    ggplot2::xlab(xlab) +
    ggplot2::ylab(ylab) +
    theme

  # Add the red background bars for missing data areas
  if (length(id_na) > 0) {
    # Red Bars only if missing data in time series
    # Bar height: 5% above the largest observed value
    na_val <- max(df$value*1.05, na.rm = TRUE)
    gg <- gg + ggplot2::geom_bar(
      data = df[is.na(df$value), ], stat = "identity",
      ggplot2::aes(x = time, y = na_val),
      col = color_missing_border, fill = color_missing, alpha = alpha_missing,
      width = width_na_bar
    )
  }

  ##
  ## End creating the ggplot2 plot
  ##

  return(gg)
}

================================================
FILE: R/ggplot_na_distribution2.R
================================================
#' @title Stacked Bar Plot to Visualize Missing Values per Time Interval
#'
#' @description Visualization of missing values in barplot form.
#' Especially useful when looking at specific intervals and for
#' time series with a lot of observations.
#'
#' @param x Numeric Vector (\code{\link[base]{vector}}) or Time Series
#' (\code{\link[stats]{ts}}) object containing NAs. This is the only mandatory
#' parameter - all other parameters are only needed for adjusting the plot appearance.
#'
#' @param number_intervals Defines the number of bins to be created. Default
#' number of intervals (denoted by NULL) is calculated by \code{\link[grDevices]{nclass.Sturges}}
#' using Sturges' formula. If the interval_size parameter is set to a value
#' different to NULL this parameter is ignored.
#'
#' @param interval_size Defines how many observations should be in one bin/interval.
#' The required number of overall bins is afterwards calculated automatically.
#' If used this parameter overwrites the number_intervals parameter.
#' For a very long time series be sure to make the interval_size not extremely
#' small, otherwise because of overplotting issues nothing can be seen until
#' you also increase the plot width.
#'
#' @param measure Whether the NA / non-NA ratio should be given as
#' percent or absolute numbers.
#'
#' \itemize{
#'    \item{"percent" - for percentages}
#'
#'    \item{"count" - for absolute numbers of NAs}
#'    }
#'
#' @param color_missing Color for the amount of missing values.
#'
#' @param color_existing Color for the amount of existing values.
#'
#' @param alpha_missing Alpha (transparency) value for the missing values.
#'
#' @param alpha_existing Alpha (transparency) value for the existing values.
#'
#' @param title Title of the Plot (NULL for deactivating title).
#'
#' @param subtitle Subtitle of the Plot (NULL for deactivating subtitle).
#'
#' @param xlab Label for x-Axis. Automatically set to the current interval size, if
#' no custom text is chosen (NULL for deactivating the label).
#'
#' @param ylab Label for y-Axis. As default (NULL), the axis is automatically set
#' to either 'Percent' or 'Count' dependent on the settings of parameter \code{measure}.
#'
#' @param color_border Color for the small borders between the intervals/bins.
#' Default is 'white'.
#'
#' @param theme Set a Theme for ggplot2. Default is ggplot2::theme_linedraw().
#' (\code{\link[ggplot2]{theme_linedraw}})
#'
#' @return The output is a \code{\link[ggplot2]{ggplot2}} object that can be
#' further adjusted by using the ggplot syntax
#'
#' @details This function visualizes the distribution of missing values within
#' a time series. In comparison to the \code{\link[imputeTS]{ggplot_na_distribution}}
#' function this is not done by plotting each observation of the time series
#' separately. Instead observations for time intervals are represented as
#' intervals/bins of multiple values. For these intervals information about
#' the amount of missing values are shown. This has the advantage, that also
#' for large time series a plot which is easy to overview can be created.
#'
#' With \code{measure = "percent"} the shares are calculated relative to the
#' number of observations of the respective interval, so also a shorter last
#' interval adds up to 100 percent.
#'
#' The only really needed parameter for this function is x (the univariate
#' time series that shall be visualized). All other parameters are solely
#' for altering the appearance of the plot.
#'
#' As long as the input is univariate and numeric the function also takes
#' data.frame, tibble, tsibble, zoo, xts as an input.
#'
#' The plot can be adjusted to your needs via the function parameters.
#' Additionally, for more complex adjustments, the output can also be
#' adjusted via ggplot2 syntax. This is possible, since the output
#' of the function is a ggplot2 object. Also take a look at the Examples
#' to see how adjustments are made.
#'
#' @author Steffen Moritz, Sebastian Gatscha
#'
#' @seealso \code{\link[imputeTS]{ggplot_na_distribution}},
#' \code{\link[imputeTS]{ggplot_na_gapsize}},
#' \code{\link[imputeTS]{ggplot_na_gapsize2}},
#' \code{\link[imputeTS]{ggplot_na_imputations}}
#'
#' @examples
#' # Example 1: Visualize the missing values in tsNH4 time series as percentages
#' ggplot_na_distribution2(tsNH4)
#'
#' # Example 2: Visualize the missing values in tsNH4 time series as counts
#' ggplot_na_distribution2(tsNH4, measure = "count")
#'
#' # Example 3: Visualize the missing values in tsHeating time series
#' ggplot_na_distribution2(tsHeating)
#'
#' # Example 4: Same as example 1, just written with pipe operator
#' tsNH4 %>% ggplot_na_distribution2()
#'
#' # Example 5: Visualize NAs in tsNH4 - exactly 8 intervals
#' ggplot_na_distribution2(tsNH4, number_intervals = 8)
#'
#' # Example 6: Visualize NAs in tsNH4 - 300 observations per interval
#' ggplot_na_distribution2(tsNH4, interval_size = 300)
#'
#' # Example 7: Visualize NAs in tsAirgap - different color for NAs
#' # Plot adjustments via ggplot_na_distribution2 function parameters
#' ggplot_na_distribution2(tsAirgap, color_missing = "pink")
#'
#' # Example 8: Visualize NAs in tsNH4 - different theme
#' # Plot adjustments via ggplot_na_distribution2 function parameters
#' ggplot_na_distribution2(tsNH4, theme = ggplot2::theme_classic())
#'
#' # Example 9: Visualize NAs in tsAirgap - title, subtitle in center
#' # Plot adjustments via ggplot2 syntax
#' ggplot_na_distribution2(tsAirgap) +
#'   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
#'   ggplot2::theme(plot.subtitle = ggtext::element_markdown(hjust = 0.5))
#'
#' # Example 10: Visualize NAs in tsAirgap - title in center, no subtitle
#' # Plot adjustments via ggplot2 syntax and function parameters
#' ggplot_na_distribution2(tsAirgap, subtitle = NULL) +
#'   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
#'
#' # Example 11: Visualize NAs in tsAirgap - x-axis texts with angle
#' # Plot adjustments via ggplot2 syntax and function parameters
#' ggplot_na_distribution2(tsAirgap, color_missing = "grey") +
#'   ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
#'
#' @importFrom magrittr %>%
#'
#' @importFrom grDevices nclass.Sturges
#'
#' @importFrom ggplot2 theme_linedraw alpha ggplot aes scale_fill_manual
#' theme element_blank scale_x_continuous scale_y_continuous
#' labs xlab ylab stat_bin after_stat theme_classic
#'
#' @importFrom ggtext element_markdown
#'
#' @export
ggplot_na_distribution2 <- function(x,
                                    number_intervals = NULL,
                                    interval_size = NULL,
                                    measure = "percent",
                                    color_missing = "indianred2",
                                    color_existing = "steelblue",
                                    alpha_missing = 0.8,
                                    alpha_existing = 0.3,
                                    title = "Missing Values per Interval",
                                    subtitle = "Amount of NA and non-NA for successive intervals",
                                    xlab = "Time Lapse (Interval Size: XX)",
                                    ylab = NULL,
                                    color_border = "white",
                                    theme = ggplot2::theme_linedraw()) {
  data <- x

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 special handling data types
  # tsibble: the second column is taken as the value column,
  # other tibbles: the first column is taken
  if (inherits(data, "tbl_ts")) {
    data <- as.vector(as.data.frame(data)[, 2])
  } else if (inherits(data, "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.2 Check if the input is multivariate
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    stop("x is not univariate. The function only works with univariate input for x. For data types with multiple variables/columns only input the column you want to plot as parameter x.")
  }

  # 1.3 Checks and corrections for wrong data dimension
  # Altering multivariate objects with 1 column (which are essentially
  # univariate) to be dim = NULL
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.4 Input as vector
  data <- as.vector(data)

  # 1.5 Check if input is numeric
  if (!is.numeric(data)) {
    stop("Input x is not numeric")
  }

  # 1.6 Check preconditions about amount of NAs
  # exclude NA only inputs
  missindx <- is.na(data)
  if (all(missindx)) {
    stop("Input data consists only of NAs. At least one non-NA numeric value is needed for creating a meaningful ggplot_na_distribution2 plot")
  }

  ##
  ## End Input Check and Transformation
  ##


  ##
  ## 2. Preparations
  ##

  n_obs <- length(data)

  # 2.1 Calculation default number of intervals
  if (is.null(number_intervals)) {
    number_intervals <- grDevices::nclass.Sturges(data)
  }

  # 2.2 Calculation break points
  # interval_size (if given) takes precedence over number_intervals.
  # The derived step is at least 1: requesting more intervals than there are
  # observations would otherwise give a step of 0 and make seq() fail.
  if (!is.null(interval_size)) {
    step <- interval_size
  } else {
    step <- max(1, floor(n_obs / number_intervals))
  }
  # The final break closes the (possibly shorter) last interval
  breaks <- c(seq(from = 0, to = n_obs - 1, by = step), n_obs)
  binwidth <- breaks[2]

  # 2.3 Process parameter settings

  # Add alpha values to colors
  color_missing <- ggplot2::alpha(color_missing, alpha_missing)
  color_existing <- ggplot2::alpha(color_existing, alpha_existing)

  # Set ylab according to choosen measure
  if (is.null(ylab)) {
    ylab <- if (measure == "percent") "Percent" else "Count"
  }

  # Set xlab according to choosen parameters
  # (xlab = NULL is passed through unchanged and deactivates the label)
  if (!is.null(xlab) && xlab == "Time Lapse (Interval Size: XX)") {
    xlab <- paste("Time Lapse (Interval Size:", binwidth, ")")
  }

  # 2.4 Create dataframe for ggplot2
  # miss has the levels "FALSE" (existing value) and "TRUE" (missing value)
  index <- seq_along(data)
  miss <- as.factor(missindx)
  df <- data.frame(index, miss)

  ##
  ## End Preparations
  ##


  ##
  ## 3. Create the ggplot2 plot
  ##

  # Create the ggplot2 plot
  # Colors and legend labels are assigned by factor level name, so that
  # existing values can not end up with the label / color of the NAs
  gg <- ggplot2::ggplot(df, ggplot2::aes(index, fill = miss)) +
    ggplot2::scale_fill_manual(
      values = c("FALSE" = color_existing, "TRUE" = color_missing),
      labels = c("FALSE" = "non-NAs", "TRUE" = "NAs")
    ) +
    theme +
    ggplot2::theme(
      legend.position = "none",
      legend.title = ggplot2::element_blank(),
      plot.subtitle = ggtext::element_markdown(),
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor.x = ggplot2::element_blank()
    ) +
    ggplot2::scale_x_continuous(expand = c(0, 0)) +
    ggplot2::labs(title = title, subtitle = subtitle) +
    ggplot2::xlab(xlab) +
    ggplot2::ylab(ylab)

  # count and width are variables computed by stat_bin, the NULL assignments
  # only define the names locally for the non-standard evaluation below
  count <- NULL
  width <- NULL

  if (measure == "percent") {
    # Divide by the width of the respective bin (= number of observations in
    # it, since index is 1..n), not by the width of the first bin. Otherwise
    # a shorter last interval would never sum up to 100%.
    gg <- gg + ggplot2::stat_bin(ggplot2::aes(y = ggplot2::after_stat(count / width)),
      col = color_border, breaks = breaks, closed = "right"
    ) +
      ggplot2::scale_y_continuous(expand = c(0, 0), labels = function(x) paste0(x*100, "%"))
  } else {
    gg <- gg + ggplot2::stat_bin(ggplot2::aes(y = ggplot2::after_stat(count)),
      col = color_border, breaks = breaks, closed = "right"
    ) +
      ggplot2::scale_y_continuous(expand = c(0, 0))
  }

  return(gg)
}

================================================
FILE: R/ggplot_na_gapsize.R
================================================
#' @title Bar Plot to Visualize Occurrences of Different NA Gap Sizes
#'
#' @description Visualize the Number of Occurrences for existing NA Gap Sizes
#' (NAs in a row) in a Time Series
#'
#' @param x Numeric Vector (\code{\link[base]{vector}}) or Time Series
#' (\code{\link[stats]{ts}}) object containing NAs. This is the only mandatory
#' parameter - all other parameters are only needed for adjusting the plot appearance.
#'
#' @param limit Specifies how many of the most common gap sizes are shown in
#' the plot. Default is 10. So only the 10 most often occurring gapsizes will
#' be shown.
If more or all present gap sizes should be displayed, the limit needs #' to be increased. Since this might add a lot of additional data, having #' parameter \code{orientation} set to 'horizontal' avoids overlaps in the axis #' labels. #' #' @param include_total When set to TRUE the total NA count for a gapsize is #' included in the plot (total = number occurrence x gap size). #' E.g. if a gapsize of 3 occurs 10 times, this means this gap size makes #' up for 30 NAs in total. This can be a good indicator of the #' overall impact of a gapsize. #' #' @param ranked_by Should the results be sorted according to the number of #' occurrence or total resulting NAs for a gapsize. Total resulting NAs #' are calculated by (total = number occurrence x gap size). #' \itemize{ #' \item{"occurrence" - Sorting by 'number of occurrence' of a gap size} #' #' \item{"total" - Sorting by 'total resulting NAs' of a gap size} #' } #' #' The default setting is "occurrence". #' #' @param color_occurrence Defines the Color for the Bars of #' 'number of occurrence'. #' #' @param color_total Defines the color for the bars of #' 'total resulting NAs'. #' #' @param color_border Defines the color for the border of the bars. #' #' @param alpha_bars Alpha (transparency) value used for filling the bars. #' #' @param title Title of the Plot. #' #' @param subtitle Subtitle of the Plot. #' #' @param xlab Label for x-Axis. #' #' @param ylab Label for y-Axis. #' #' @param legend If TRUE a legend is added at the bottom. #' #' @param orientation Can be either 'vertical' or 'horizontal'. Defines #' if the bars are plotted vertically or horizontally. For large amounts #' of different gap sizes horizontal illustration is favorable (also see #' parameter \code{limit}). #' #' @param label_occurrence Defines the label assigned to 'number of occurrence' #' in the legend. #' @param label_total Defines the label assigned to 'total resulting NAs' #' in the legend. #' #' @param theme Set a Theme for ggplot2. 
Default is ggplot2::theme_linedraw(). #' (\code{\link[ggplot2]{theme_linedraw}}) #' #' @author Steffen Moritz, Sebastian Gatscha #' #' @return The output is a \code{\link[ggplot2]{ggplot2}} object that can be #' further adjusted by using the ggplot syntax #' #' @details This plotting function can be used to visualize the length of #' the NA gaps (NAs in a row) in a time series. It shows a ranking of which #' gap sizes occur most often. This ranking can be ordered by the number of #' occurrences of the gap sizes or by total resulting NAs for this gap size #' (occurrence * gap length). A NA-gap of 3 occurring 10 times means 30 total #' resulting NAs. #' #' A resulting plot can for example be described like this: #' a 2 NA-gap (2 NAs in a row) occurred 27 times, #' a 9 NA-gap (9 NAs in a row) occurred 11 times, #' a 27 NA-gap (27 NAs in a row) occurred 1 time, ... #' #' The only really needed parameter for this function is x (the univariate #' time series with NAs that shall be visualized). All other parameters #' are solely for altering the appearance of the plot. #' #' As long as the input is univariate and numeric, the function also takes #' data.frame, tibble, tsibble, zoo, xts as an input. #' #' The plot can be adjusted to your needs via the function parameters. #' Additionally, for more complex adjustments, the output can also be #' adjusted via ggplot2 syntax. This is possible, since the output #' of the function is a ggplot2 object. Also take a look at the Examples #' to see how adjustments are made.
#' #' @seealso \code{\link[imputeTS]{ggplot_na_gapsize2}}, #' \code{\link[imputeTS]{ggplot_na_distribution}}, #' \code{\link[imputeTS]{ggplot_na_distribution2}}, #' \code{\link[imputeTS]{ggplot_na_imputations}} #' #' @examples #' # Example 1: Visualize the top gap sizes in tsNH4 (top 10 by default) #' ggplot_na_gapsize(tsNH4) #' #' # Example 2: Visualize the top gap sizes in tsAirgap - vertical bars #' ggplot_na_gapsize(tsAirgap, orientation = "vertical") #' #' # Example 3: Same as example 1, just written with pipe operator #' tsNH4 %>% ggplot_na_gapsize() #' #' # Example 4: Visualize the top 20 gap sizes in tsNH4 #' ggplot_na_gapsize(tsNH4, limit = 20) #' #' # Example 5: Visualize top gap sizes in tsNH4 without showing total NAs #' ggplot_na_gapsize(tsNH4, limit = 20, include_total = FALSE) #' #' # Example 6: Visualize top gap sizes in tsNH4 but ordered by total NAs #' # (total = occurrence * gap length) #' ggplot_na_gapsize(tsNH4, limit = 20, ranked_by = "total") #' #' # Example 7: Visualize top gap sizes in tsNH4 - different theme #' # Plot adjustments via ggplot_na_gapsize function parameters #' ggplot_na_gapsize(tsNH4, theme = ggplot2::theme_classic()) #' #' # Example 8: Visualize top gap sizes in tsNH4 - title, subtitle in center #' # Plot adjustments via ggplot2 syntax #' ggplot_na_gapsize(tsNH4) + #' ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + #' ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) #' #' # Example 9: Visualize top gap sizes in tsNH4 - title in center, no subtitle #' # Plot adjustments via ggplot2 syntax and function parameters #' ggplot_na_gapsize(tsNH4, subtitle = NULL) + #' ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) #' #' # Example 10: Top gap sizes in tsNH4 - legend on the right and color change #' # Plot adjustments via ggplot2 syntax and function parameters #' ggplot_na_gapsize(tsNH4, color_total = "grey") + #' ggplot2::theme(legend.position = "right") #' @importFrom magrittr
%>%
#'
#' @importFrom ggplot2 theme_linedraw ggplot geom_bar position_dodge aes scale_x_discrete
#' scale_fill_manual ggtitle xlab ylab theme element_text element_blank
#' coord_flip theme_classic
#'
#' @export
ggplot_na_gapsize <- function(x,
                              limit = 10,
                              include_total = TRUE,
                              ranked_by = "occurrence",
                              color_occurrence = "indianred",
                              color_total = "steelblue",
                              color_border = "black",
                              alpha_bars = 1,
                              title = "Occurrence of gap sizes",
                              subtitle = "Gap sizes (NAs in a row) ordered by most common",
                              xlab = NULL,
                              ylab = "Number occurrence",
                              legend = TRUE,
                              orientation = "horizontal",
                              label_occurrence = "Number occurrence gapsize",
                              label_total = "Resulting NAs for gapsize",
                              theme = ggplot2::theme_linedraw()) {
  # Purpose: rank the NA gap sizes (runs of consecutive NAs) found in x and
  # draw them as bars - one bar per gap size for its number of occurrences
  # and, unless include_total is FALSE, a second bar for the total NAs it
  # accounts for (occurrence * gap size).
  #
  # Returns: the assembled ggplot object.
  #
  # Errors: stops when x is multivariate, not numeric, consists only of NAs,
  # contains no NAs, or when ranked_by / limit have invalid values.

  data <- x

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 special handling data types
  # "tbl_ts" objects use their second column, other "tbl" objects the first
  if (any(class(data) == "tbl_ts")) {
    data <- as.vector(as.data.frame(data)[, 2])
  } else if (any(class(data) == "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.2 Check if the input is multivariate
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    stop("x is not univariate. The function only works with univariate input for x. For data types with multiple variables/columns only input the column you want to plot as parameter x.")
  }

  # 1.3 Checks and corrections for wrong data dimension
  # Altering multivariate objects with 1 column (which are essentially
  # univariate) to be dim = NULL
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.4 Input as vector
  data <- as.vector(data)

  # 1.5 Check if input is numeric
  if (!is.numeric(data)) {
    stop("Input x is not numeric")
  }

  # 1.6 Check preconditions about amount of NAs

  # exclude NA only inputs
  missindx <- is.na(data)
  if (all(missindx)) {
    stop("Input data consists only of NAs. At least one non-NA numeric value is needed for creating a meaningful ggplot_na_gapsize plot)")
  }

  # exclude inputs without NAs
  if (!any(missindx)) {
    stop("Input data contains no NAs. At least one missing value is needed to create a meaningful ggplot_na_gapsize plot)")
  }

  ##
  ## End Input Check and Transformation
  ##


  ##
  ## 2. Preparations
  ##

  # 2.1 Create required data

  # Calculation consecutive NA information:
  # run lengths of the TRUE runs of the NA indicator are the gap sizes
  rle_na <- base::rle(missindx)
  vec <- rle_na$lengths[rle_na$values]
  occurrence_bar <- table(vec)
  gaps_vec <- as.integer(names(occurrence_bar))
  totals_bar <- occurrence_bar * gaps_vec
  labels1 <- paste0(gaps_vec, " NA-gap")

  # 2.2 Adjust to parameter selection by user

  # Sorting for ranked_by param - pick the ranking key once, sort once
  if (ranked_by == "occurrence") {
    # sort according to occurrence of gapsizes
    rank_key <- occurrence_bar
  } else if (ranked_by == "total") {
    # sort according to total NAs
    rank_key <- totals_bar
  } else {
    stop("Wrong input for parameter ranked_by. Input must be either 'occurrence' or 'total'. Call ?ggplot_na_gapsize to view the documentation.")
  }
  fooind <- order(rank_key)
  occurrence_bar <- occurrence_bar[fooind]
  totals_bar <- totals_bar[fooind]
  labels1 <- labels1[fooind]

  # Adjust to show only a limited amount of bars for limit param.
  # limit must be a single number >= 1: zero or negative values would build
  # a descending index range and fractional values a fractional one, both of
  # which select the wrong bars.
  if (!is.numeric(limit) || length(limit) != 1 || is.na(limit) || limit < 1) {
    stop("Wrong input for parameter limit. Input must be a single number >= 1. Call ?ggplot_na_gapsize to view the documentation.")
  }
  n_gaps <- length(occurrence_bar)
  if (n_gaps > limit) {
    # data is sorted ascending, so the top ranked entries are the last ones
    keep <- (n_gaps - floor(limit) + 1):n_gaps
    occurrence_bar <- occurrence_bar[keep]
    totals_bar <- totals_bar[keep]
    labels1 <- labels1[keep]
  }

  # 2.3 Create dataframe for ggplot2

  # data.frame for ggplot (id is recycled across both bar groups)
  id <- seq_along(occurrence_bar)
  val <- c(occurrence_bar, totals_bar)
  label <- c(
    rep("occurrence_bar", length(occurrence_bar)),
    rep("totals_bar", length(totals_bar))
  )
  df <- data.frame(id, val, label)

  # Colors / legend labels in the order of the fill levels
  fill_values <- c(color_occurrence, color_total)
  fill_labels <- c(label_occurrence, label_total)

  # Only number of occurrences bar
  if (include_total == FALSE) {
    df <- subset(df, label == "occurrence_bar")
    # keep the manual fill scale in step with the single remaining level
    fill_values <- fill_values[1]
    fill_labels <- fill_labels[1]
  }

  ##
  ## End Preparations
  ##


  ##
  ## 3. Create the ggplot2 plot
  ##

  # Create ggplot
  gg <- ggplot2::ggplot(data = df) +
    ggplot2::geom_bar(
      ggplot2::aes(x = id, y = val, fill = label),
      color = color_border, width = 0.6, alpha = alpha_bars,
      stat = "identity", position = ggplot2::position_dodge(width = 0.7)
    ) +
    ggplot2::scale_x_discrete(
      labels = labels1,
      limits = labels1
    ) +
    ggplot2::scale_fill_manual(
      values = fill_values,
      labels = fill_labels
    ) +
    ggplot2::ggtitle(title, subtitle = subtitle) +
    ggplot2::xlab(xlab) +
    ggplot2::ylab(ylab) +
    theme +
    ggplot2::theme(
      legend.position = "bottom",
      axis.text.x = ggplot2::element_text(angle = 30, hjust = 1),
      legend.title = ggplot2::element_blank()
    )

  # For flipping from vertical to horizontal bars
  if (orientation == "horizontal") {
    gg <- gg + ggplot2::coord_flip()
  }

  # Removing legend
  if (!legend) {
    gg <- gg +
      ggplot2::theme(legend.position = "none")
  }

  ##
  ## End creating the ggplot2 plot
  ##

  return(gg)
}

================================================
FILE: R/ggplot_na_gapsize2.R
================================================
#' @title Bubble Plot to Visualize Total NA Count of NA gap sizes
#'
#' @description Visualize the total NA count (gap size * occurrence) for
#' the existing gaps sizes (NAs in a row).
#'
#' @param x Numeric Vector (\code{\link[base]{vector}}) or Time Series
#' (\code{\link[stats]{ts}}) object containing NAs. This is the only
#' mandatory parameter - all other parameters are only needed for adjusting
#' the plot appearance.
#'
#' @param colors_bubbles Choose a color gradient that encodes lower to
#' higher total NA counts.
#' Color codes can be given as vector. Using color palettes from colorspace,
#' grDevices, RColorBrewer or other packages is useful here.
#' E.g. grDevices::heat.colors(10) would be a possible input.
#'
#' @param color_border Color for the border of the bubbles.
#'
#' @param alpha_bubbles Alpha (transparency) value used for filling the bubbles.
#' #' @param labels_bubbles Should labels be added to the individual bubbles inside #' the plot. #' For many datasets there will be overplotting issues once labels are added. #' In these cases using the min_gapsize, min_totals or min_occurrence options #' might be useful to only display the most relevant gap sizes. #' #' You can choose between these labels to be added: #' \itemize{ #' \item{"none" - No label gets added to the bubbles} #' (default choice) #' #' \item{"gap" - Adds a label displaying the gap size belonging to the #' respective bubble} #' #' \item{"total" - Adds a label displaying the total NA count for the #' respective bubble} #' #' \item{"occurrence" - Adds a label displaying the number of occurrences #' of the respective gap size} #' #' \item{"gap-occurrence" - Adds a label displaying the respective #' gap size and number of its occurrence} #' } #' #' The default setting is "none". #' #' @param size_bubbles Allows to scale the size of the bubbles. #' Some experimenting with this parameter might be needed to get #' a good visualization for your specific dataset. #' #' @param min_totals Only print bubbles for gap sizes that account #' for at least min_totals NAs in the time series. #' #' @param min_occurrence Only print bubbles for gap sizes that occur at least #' min_occurrence times in the time series. #' #' @param min_gapsize Only show gap sizes of at least min_gapsize. Together with #' max_gapsize enables zooming into certain regions of interest. #' #' @param max_gapsize Only show gap sizes of at most max_gapsize. Together with #' min_gapsize enables zooming into certain regions of interest. #' #' @param title Title of the Plot. #' #' @param subtitle Subtitle of the Plot. #' #' @param xlab Label for x-Axis. #' #' @param ylab Label for y-Axis. #' #' @param legend If TRUE a legend is added on the right side #' #' @param legend_breaks Number of displayed breaks / labels in the legend. #' Needs an integer giving the desired number of breaks as input.
Breakpoints are #' internally calculated by R's pretty() function, which can also lead to #' values slightly smaller or larger than the desired number. #' #' #' @param legend_title Defines the title of the legend. #' #' @param legend_position Defines position of the legend. Choose either #' 'bottom', 'right', 'left' or 'top'. #' #' @param legend_point_sizes Defines the size of the symbols representing the total #' NA bubbles in the legend. #' #' You can choose between "default", "actual" or a custom vector of sizes. #' #' \itemize{ #' \item{"default" - Scales the points in the legend to symbolically #' resemble the size differences} (default choice) #' #' \item{"actual" - Scales the points in the legend according #' to their actual size in the plot} #' } #' #' Since these two options are not always sufficient, a custom vector of #' sizes can be used as input. This would look like this: c(4,5,6,7). Be #' aware, that the length of this vector must match the number of breakpoints #' (can be adjusted with legend_breaks). #' #' @param theme Set a theme for ggplot2. Default is ggplot2::theme_linedraw(). #' (\code{\link[ggplot2]{theme_linedraw}}) #' #' @author Steffen Moritz #' #' @return The output is a \code{\link[ggplot2]{ggplot2}} object that can be #' further adjusted by using the ggplot syntax #' #' @details This function visualizes total NA counts by individual gap size #' (consecutive NAs) in a time series. The bubble plot makes it easy to see #' which gap sizes account for most of the NAs in the series. The size and #' color of the bubbles represent the total number of NAs a given gap size #' accounts for. #' #' Total NAs for a gap size are calculated as follows: #' total NAs = occurrence * gap length #' #' For example, interpret a bubble for gap size 2 as follows: #' a 2-NA gap (two NAs in a row) occurred 27 times in the time series and thus #' accounts for 54 total NAs. #' #' On the x-axis, the different gap sizes are plotted in increasing order.
#' The y-axis shows the occurrence count of these gap sizes in the time series. #' #' The plot is useful for investigating possible root causes of the missing #' data. It can indicate whether the missing data are random or whether there #' are patterns of interest. #' #' Depending on the input time series, there might be too much information in #' the plot, leading to overplotting. In these cases, use the parameters #' \code{min_totals}, \code{min_occurrence}, and \code{min_gapsize} to display #' only the information of interest. #' #' The only required parameter is \code{x} (the univariate time series with NAs #' to visualize). All other parameters alter the appearance of the plot. #' #' As long as the input is univariate and numeric, the function also accepts #' \code{data.frame}, \code{tibble}, \code{tsibble}, \code{zoo}, or \code{xts} #' input. #' #' The plot can be adjusted via function parameters. For more complex #' adjustments, you can modify the result using ggplot2 syntax, since the #' function returns a ggplot2 object. See the Examples for typical adjustments. 
#' #' #' @seealso \code{\link[imputeTS]{ggplot_na_distribution}}, #' \code{\link[imputeTS]{ggplot_na_distribution2}}, #' \code{\link[imputeTS]{ggplot_na_gapsize}}, #' \code{\link[imputeTS]{ggplot_na_imputations}} #' #' @examples #' # Example 1: Visualize total NA counts in tsNH4 #' ggplot_na_gapsize2(tsNH4) #' #' # Example 2: Visualize total NA counts in tsNH4, different color gradient #' ggplot_na_gapsize2(tsNH4, colors_bubbles = rev(grDevices::heat.colors(10))) #' #' # Example 3: Same as example 1, just written with pipe operator #' tsNH4 %>% ggplot_na_gapsize2() #' #' # Example 4: Visualize total NA counts in tsHeating #' # Limited to gap sizes that account for a total of > 600 NAs #' ggplot_na_gapsize2(tsHeating, min_totals = 600) #' #' # Example 5: Visualize total NA counts in tsNH4 - no legend #' ggplot_na_gapsize2(tsNH4, legend = FALSE) #' #' # Example 6: Visualize total NA counts in tsAirgap - increased bubble size #' ggplot_na_gapsize2(tsAirgap, size_bubbles = 35) #' #' # Example 7: Visualize total NA counts in tsNH4 #' # Plot adjustments via ggplot_na_gapsize2 function parameters #' ggplot_na_gapsize2(tsNH4, theme = ggplot2::theme_classic()) #' #' # Example 8: Visualize total NA counts in tsNH4 - title, subtitle in center #' # Plot adjustments via ggplot2 syntax #' ggplot_na_gapsize2(tsNH4) + #' ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + #' ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) #' #' # Example 9: Visualize total NA counts in tsNH4 - title in center, no subtitle #' # Plot adjustments via ggplot2 syntax and function parameters #' ggplot_na_gapsize2(tsNH4, subtitle = NULL) + #' ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) #' #' # Example 10: Total NA counts in tsNH4 - legend on the bottom and color change #' # Plot adjustments via ggplot2 syntax and function parameters #' ggplot_na_gapsize2(tsNH4, colors_bubbles = grDevices::heat.colors(10)) + #' ggplot2::theme(legend.position = "bottom") 
#' @importFrom magrittr %>%
#'
#' @importFrom ggplot2 theme_linedraw ggplot aes geom_point scale_size_identity
#' geom_text scale_x_continuous scale_y_continuous scale_fill_gradientn
#' guide_legend ggtitle xlab ylab theme element_text theme_classic
#'
#' @importFrom grDevices heat.colors
#'
#' @export
ggplot_na_gapsize2 <- function(x,
                               colors_bubbles = c(
                                 "#FCFBFF", "#EFEEFA", "#DDDAEF", "#C8C3E2", "#B1AAD4",
                                 "#9A8FC4", "#8273B5", "#6B56A7", "#553695", "#3D1778"
                               ),
                               color_border = "black",
                               alpha_bubbles = 0.4,
                               labels_bubbles = "none",
                               size_bubbles = 25,
                               min_totals = NULL,
                               min_occurrence = NULL,
                               min_gapsize = NULL,
                               max_gapsize = NULL,
                               title = "Gap Size Analysis",
                               subtitle = "Total NA counts for different gapsizes",
                               xlab = "Gapsize",
                               ylab = "Number occurrence",
                               legend = TRUE,
                               legend_breaks = 4,
                               legend_title = "Total NAs",
                               legend_position = "right",
                               legend_point_sizes = "default",
                               theme = ggplot2::theme_linedraw()) {
  # Purpose: draw one bubble per NA gap size (run of consecutive NAs) found
  # in x. x-position is the gap size, y-position its number of occurrences,
  # bubble size and fill encode the total NAs (occurrence * gap size).
  #
  # Returns: the assembled ggplot object.
  #
  # Errors: stops when x is multivariate, not numeric, consists only of NAs,
  # contains no NAs, when labels_bubbles / legend_point_sizes are invalid,
  # or when the min_* / max_* filters leave nothing to plot.

  data <- x

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 special handling data types
  # "tbl_ts" objects use their second column, other "tbl" objects the first
  if (any(class(data) == "tbl_ts")) {
    data <- as.vector(as.data.frame(data)[, 2])
  } else if (any(class(data) == "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.2 Check if the input is multivariate
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    stop("x is not univariate. The function only works with univariate input for x. For data types with multiple variables/columns only input the column you want to plot as parameter x.")
  }

  # 1.3 Checks and corrections for wrong data dimension
  # Altering multivariate objects with 1 column (which are essentially
  # univariate) to be dim = NULL
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.4 Input as vector
  data <- as.vector(data)

  # 1.5 Check if input is numeric
  if (!is.numeric(data)) {
    stop("Input x is not numeric")
  }

  # 1.6 Check preconditions about amount of NAs

  # exclude NA only inputs
  missindx <- is.na(data)
  if (all(missindx)) {
    stop("Input data consists only of NAs. At least one non-NA numeric value is needed for creating a meaningful ggplot_na_gapsize2 plot)")
  }

  # exclude inputs without NAs
  if (!any(missindx)) {
    stop("Input data contains no NAs. At least one missing value is needed to create a meaningful ggplot_na_gapsize2 plot)")
  }

  # 1.7 Check the label selection
  # An unknown value used to be ignored silently (plot without labels);
  # report it instead, in line with the legend_point_sizes check below.
  valid_labels <- c("none", "gap", "total", "occurrence", "gap-occurrence")
  if (!is.character(labels_bubbles) || length(labels_bubbles) != 1 ||
    !(labels_bubbles %in% valid_labels)) {
    stop("Wrong input for parameter labels_bubbles. Input must be either 'none', 'gap', 'total', 'occurrence' or 'gap-occurrence'. Call ?ggplot_na_gapsize2 to view the documentation.")
  }

  ##
  ## End Input Check and Transformation
  ##


  ##
  ## 2. Preparations
  ##

  # 2.1 Create required data

  # Calculation consecutive NA information:
  # run lengths of the TRUE runs of the NA indicator are the gap sizes
  rle_na <- base::rle(missindx)
  vec <- rle_na$lengths[rle_na$values]
  gap_table <- table(vec)
  gap_names <- as.integer(names(gap_table))
  occurrences <- as.integer(gap_table)
  totals <- occurrences * gap_names

  # 2.2 Create dataframe for ggplot2
  df <- data.frame(gap = gap_names, occurrence = occurrences, total = totals)

  # 2.3 Adjust data to user selected parameters / filter
  # Filters to display only subsets of the data (all bounds are inclusive)

  # Maximum Gapsize
  if (!is.null(max_gapsize)) {
    df <- subset(df, gap <= max_gapsize)
  }

  # Minimum gapsize
  if (!is.null(min_gapsize)) {
    df <- subset(df, gap >= min_gapsize)
  }

  # Minimum Total NAs
  if (!is.null(min_totals)) {
    df <- subset(df, total >= min_totals)
  }

  # Minimum Occurrence NAs
  if (!is.null(min_occurrence)) {
    df <- subset(df, occurrence >= min_occurrence)
  }

  # Error for too restrictive filters leaving no NA data to display
  if (length(df$gap) < 1) {
    stop("Too restrictive filter options set - nothing to display left. Your setting of either max_gapsize, min_gapsize, min_totals, min_occurrence or the combination of them left no NA data to display.)")
  }

  # 2.4 Calculate legend breaks and sizes

  # Create legend break points with pretty function.
  # Only use points within limits - otherwise there will be an error
  leg_breaks <- base::pretty(df$total, n = legend_breaks)
  leg_breaks <- leg_breaks[leg_breaks >= min(df$total) & leg_breaks <= max(df$total)]

  # Prevent empty breaks, when pretty() only chooses values outside limits.
  # The fallback has to come from the filtered data: the first unfiltered
  # total may itself lie outside the limits once filters were applied.
  if (length(leg_breaks) == 0) {
    leg_breaks <- df$total[1]
  }

  # Define size of points in legend

  if (is.numeric(legend_point_sizes)) {
    # Manual definition of legend point size
    if (length(legend_point_sizes) == length(leg_breaks)) {
      leg_sizes <- legend_point_sizes
    } else {
      stop("When you input your own custom values for the size of the points in the legend, make sure your vector has the same size as are breaks in the legend.")
    }
  } else if (legend_point_sizes == "default") {
    # Scale points in the legend with a symbolic, sensible size
    leg_sizes <- seq(from = 3, by = 2, length.out = length(leg_breaks))
  } else if (legend_point_sizes == "actual") {
    # Scale points in the legend according to their actual size in the plot
    leg_sizes <- leg_breaks / (max(df$total) / size_bubbles)
  } else {
    stop("Wrong values for parameter legend_point_sizes chosen. To influence the size of points in the legend, either choose 'default', 'actual' or give a vector with your own desired sizes. This custom vector needs to have exactly as many elements as the legend has breaks")
  }

  ##
  ## End Preparations
  ##


  ##
  ## 3. Create the ggplot2 plot
  ##

  # Workaround for 'no visible binding' check() caused by ggplot2 vars
  gap <- df$gap
  occurrence <- df$occurrence
  total <- df$total

  # Create ggplot
  gg <- ggplot2::ggplot(data = df, ggplot2::aes(x = gap, y = occurrence)) +
    ggplot2::geom_point(
      alpha = alpha_bubbles,
      ggplot2::aes(
        fill = total,
        size = total / (max(total) / size_bubbles)
      ),
      color = color_border,
      pch = 21
    ) +
    ggplot2::scale_size_identity()

  # What to appear in the label, default ("none") adds no label layer
  label_mapping <- switch(labels_bubbles,
    "gap-occurrence" = ggplot2::aes(label = paste0(gap, "-gap\n", occurrence, "x")),
    "gap" = ggplot2::aes(label = paste0(gap, "-gap")),
    "total" = ggplot2::aes(label = paste0(total)),
    "occurrence" = ggplot2::aes(label = paste0(occurrence, "x")),
    NULL
  )
  if (!is.null(label_mapping)) {
    gg <- gg +
      ggplot2::geom_text(label_mapping, size = 2, alpha = 1, color = "black")
  }

  # Integer axis breaks starting at 0, slightly beyond the largest value
  integer_breaks <- function(x) {
    unique(floor(base::pretty(seq(0, (max(x) + 1) * 1.1))))
  }

  gg <- gg +
    ggplot2::scale_x_continuous(
      expand = c(0.1, 0.1),
      breaks = integer_breaks
    ) +
    ggplot2::scale_y_continuous(
      expand = c(0.1, 0.1),
      breaks = integer_breaks
    ) +
    ggplot2::scale_fill_gradientn(
      colors = colors_bubbles,
      breaks = leg_breaks,
      guide = ggplot2::guide_legend(
        title = legend_title,
        override.aes = list(size = leg_sizes)
      )
    ) +
    ggplot2::ggtitle(title, subtitle = subtitle) +
    ggplot2::xlab(xlab) +
    ggplot2::ylab(ylab) +
    theme +
    ggplot2::theme(
      legend.position = legend_position,
      axis.text.x = ggplot2::element_text(angle = 30, hjust = 1)
    )

  # Removing legend
  if (!legend) {
    gg <- gg +
      ggplot2::theme(legend.position = "none")
  }

  ##
  ## End creating the ggplot2 plot
  ##

  return(gg)
}

================================================
FILE: R/ggplot_na_imputations.R
================================================ #' @title Line Plot to Visualize Imputed Values #' #' @description Visualize the imputed values in a time series. #' #' @param x_with_na Numeric Vector or Time Series (\code{\link{ts}}) object #' with NAs before imputation. This parameter and x_with_imputations have to #' be set. The rest of the parameters are mostly needed for adjusting the plot #' appearance. #' #' @param x_with_imputations Numeric Vector or Time Series (\code{\link{ts}}) #' object with NAs replaced by imputed values. This parameter and #' x_with_na have to be set. The rest of the parameters are mostly #' needed for adjusting the plot appearance. #' #' @param x_with_truth Numeric Vector or Time Series (\code{\link{ts}}) object #' with the real values (optional parameter). If the ground truth is known #' (e.g. in experiments where the missing values were artificially added) #' it can be displayed in the plot with this parameter. #' Default is NULL (ground truth not known). #' #' @param x_axis_labels For adding specific x-axis labels. Takes a vector of #' \code{\link[base]{Date}} or \code{\link[base]{POSIXct}} objects as an input #' (needs the same length as x_with_na). #' The Default (NULL) uses the observation numbers as x-axis tick labels. #' #' @param title Title of the Plot. #' #' @param subtitle Subtitle of the Plot. #' #' @param xlab Label for x-Axis. #' #' @param ylab Label for y-Axis. #' #' @param color_points Color for the Symbols/Points of the non-NA Observations. #' #' @param color_imputations Color for the Symbols/Points of the Imputed Values. #' #' @param color_truth Color for the Symbols/Points of the NA value Ground Truth #' (only relevant when x_with_truth available). #' #' @param shape_points Shape for the Symbols/Points of the non-NA observations. #' See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference. #' #' @param shape_imputations Shape for the Symbols/Points of the imputed values.
#' See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference. #' #' @param shape_truth Shape for the Symbols/Points of the NA value Ground Truth #' (only relevant when x_with_truth available). #' #' @param size_points Size for the Symbols/Points of the non-NA Observations. #' #' @param size_imputations Size for the Symbols/Points of the Imputed Values. #' #' @param size_truth Size for the Symbols/Points of the NA value Ground Truth #' (only relevant when x_with_truth available). #' #' @param color_lines Color for the Lines connecting the Observations/Points. #' #' @param width_lines Width for the Lines connecting the Observations/Points. #' #' @param linetype Linetype for the Lines connecting the Observations/Points. #' #' @param connect_na If TRUE the Imputations are connected #' to the non-NA observations in the plot. Otherwise there are no #' connecting lines between symbols in NA areas. #' #' @param legend If TRUE a Legend is added at the bottom. #' #' @param legend_size Size of the Symbols used in the Legend. #' #' @param label_known Legend label for the non-NA Observations. #' #' @param label_imputations Legend label for the Imputed Values. #' #' @param label_truth Legend label for the Ground Truth of the NA values. #' #' @param theme Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). #' (\code{\link[ggplot2]{theme_linedraw}}) #' #' @details This plot can be used to visualize imputed values for a time #' series. Imputed values (filled NA gaps) are shown in a different color #' than the other values. If real values (ground truth) for the NA gaps are known, #' they can be optionally added in a different color. #' #' The only really needed parameters for this function are x_with_na #' (the time series with NAs before imputation) and x_with_imputations #' (the time series without NAs after imputation). All other parameters #' are mostly for altering the appearance of the plot.
#' #' As long as the input is univariate and numeric the function also takes #' data.frame, tibble, tsibble, zoo, xts as an input. #' #' The plot can be adjusted to your needs via the function parameters. #' Additionally, for more complex adjustments, the output can also be #' adjusted via ggplot2 syntax. This is possible, since the output #' of the function is a ggplot2 object. Also take a look at the Examples #' to see how adjustments are made. #' #' @author Steffen Moritz, Sebastian Gatscha #' #' #' @seealso \code{\link[imputeTS]{ggplot_na_distribution}}, #' \code{\link[imputeTS]{ggplot_na_distribution2}}, #' \code{\link[imputeTS]{ggplot_na_gapsize}}, #' \code{\link[imputeTS]{ggplot_na_gapsize2}} #' #' @examples #' # Example 1: Visualize imputation by na_mean #' imp_mean <- na_mean(tsAirgap) #' ggplot_na_imputations(tsAirgap, imp_mean) #' #' #' # Example 2: Visualize imputation by na_locf and added ground truth #' imp_locf <- na_locf(tsAirgap) #' ggplot_na_imputations(x_with_na = tsAirgap, #' x_with_imputations = imp_locf, #' x_with_truth = tsAirgapComplete #' ) #' #' #' # Example 3: Visualize imputation by na_kalman #' imp_kalman <- na_kalman(tsAirgap) #' ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_kalman) #' #' #' # Example 4: Same as example 1, just written with pipe operator #' tsAirgap %>% #' na_mean() %>% #' ggplot_na_imputations(x_with_na = tsAirgap) #' #' #' # Example 5: Visualize imputation by na_seadec - different color for imputed points #' # Plot adjustments via ggplot_na_imputations function parameters #' imp_seadec <- na_seadec(tsAirgap) #' ggplot_na_imputations(x_with_na = tsAirgap, #' x_with_imputations = imp_seadec, #' color_imputations = "gold") #' #' #' # Example 6: Visualize imputation - different theme, point size imputations #' # Plot adjustments via ggplot_na_imputations function parameters #' imp_seadec <- na_seadec(tsAirgap) #' ggplot_na_imputations(x_with_na = tsAirgap, #' x_with_imputations = imp_seadec, #' 
#'                       theme = ggplot2::theme_classic(),
#'                       size_imputations = 5)
#'
#'
#' # Example 7: Visualize imputation - title, subtitle in center
#' # Plot adjustments via ggplot2 syntax
#' imp_seadec <- na_seadec(tsAirgap)
#' ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_seadec) +
#'   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
#'   ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
#'
#'
#' # Example 8: Visualize imputation - title in center, no subtitle
#' # Plot adjustments via ggplot2 syntax and function parameters
#' imp_mean <- na_mean(tsAirgap)
#' ggplot_na_imputations(x_with_na = tsAirgap,
#'                       x_with_imputations = imp_mean,
#'                       subtitle = NULL) +
#'   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
#'
#' @importFrom magrittr %>%
#'
#' @importFrom ggplot2 theme_linedraw ggplot geom_line aes geom_point
#' scale_color_manual element_blank xlab ylab ggtitle guides guide_legend
#' theme theme_classic
#'
#'
#' @export
ggplot_na_imputations <- function(x_with_na,
                                  x_with_imputations,
                                  x_with_truth = NULL,
                                  x_axis_labels = NULL,
                                  title = "Imputed Values",
                                  subtitle = "Visualization of missing value replacements",
                                  xlab = "Time",
                                  ylab = "Value",
                                  color_points = "steelblue",
                                  color_imputations = "indianred",
                                  color_truth = "seagreen3",
                                  color_lines = "lightslategray",
                                  shape_points = 16,
                                  shape_imputations = 18,
                                  shape_truth = 16,
                                  size_points = 1.5,
                                  size_imputations = 2.5,
                                  size_truth = 1.5,
                                  width_lines = 0.5,
                                  linetype = "solid",
                                  connect_na = TRUE,
                                  legend = TRUE,
                                  legend_size = 5,
                                  label_known = "known values",
                                  label_imputations = "imputed values",
                                  label_truth = "ground truth",
                                  theme = ggplot2::theme_linedraw()) {

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 special handling data types
  # For tbl_ts input the second column is used, for other tbl input the first.

  # x_with_na
  if (inherits(x_with_na, "tbl_ts")) {
    x_with_na <- as.vector(as.data.frame(x_with_na)[, 2])
  } else if (inherits(x_with_na, "tbl")) {
    x_with_na <- as.vector(as.data.frame(x_with_na)[, 1])
  }

  # x_with_imputations
  if (inherits(x_with_imputations, "tbl_ts")) {
    x_with_imputations <- as.vector(as.data.frame(x_with_imputations)[, 2])
  } else if (inherits(x_with_imputations, "tbl")) {
    x_with_imputations <- as.vector(as.data.frame(x_with_imputations)[, 1])
  }

  # x_with_truth
  if (inherits(x_with_truth, "tbl_ts")) {
    x_with_truth <- as.vector(as.data.frame(x_with_truth)[, 2])
  } else if (inherits(x_with_truth, "tbl")) {
    x_with_truth <- as.vector(as.data.frame(x_with_truth)[, 1])
  }

  # 1.2 Check if the input is multivariate
  if (!is.null(dim(x_with_na)[2]) && dim(x_with_na)[2] > 1) {
    stop("x_with_na is not univariate. The function only works with univariate input for x_with_na. For data types with multiple variables/columns only input the column you want to plot as parameter x_with_na.")
  }

  if (!is.null(dim(x_with_imputations)[2]) && dim(x_with_imputations)[2] > 1) {
    stop("x_with_imputations is not univariate. The function only works with univariate input for x_with_imputations. For data types with multiple variables/columns only input the column you want to plot as parameter x_with_imputations")
  }

  if (!is.null(dim(x_with_truth)[2]) && dim(x_with_truth)[2] > 1) {
    # The message names the parameter that actually failed the check
    stop("x_with_truth is not univariate. The function only works with univariate input for x_with_truth. For data types with multiple variables/columns only input the column you want to plot as parameter x_with_truth")
  }

  # 1.3 Checks and corrections for wrong data dimension
  # Altering multivariate objects with 1 column (which are essentially
  # univariate) to be dim = NULL
  if (!is.null(dim(x_with_na)[2])) {
    x_with_na <- x_with_na[, 1]
  }

  if (!is.null(dim(x_with_imputations)[2])) {
    x_with_imputations <- x_with_imputations[, 1]
  }

  if (!is.null(dim(x_with_truth)[2])) {
    x_with_truth <- x_with_truth[, 1]
  }

  # 1.4 Input as vector (as.vector(NULL) stays NULL, so a missing
  # x_with_truth is still recognizable afterwards)
  x_with_na <- as.vector(x_with_na)
  x_with_imputations <- as.vector(x_with_imputations)
  x_with_truth <- as.vector(x_with_truth)

  has_truth <- !is.null(x_with_truth)

  # 1.5 Check if input is numeric
  if (!is.numeric(x_with_na)) {
    stop("Input x_with_na is not numeric")
  }
  if (!is.numeric(x_with_imputations)) {
    stop("Input x_with_imputations is not numeric")
  }
  if (has_truth && !is.numeric(x_with_truth)) {
    stop("Input x_with_truth is not numeric")
  }

  # 1.6 Same length of the series
  # x_with_na and x_with_imputations need same length
  if (length(x_with_na) != length(x_with_imputations)) {
    stop("Input x_with_na and x_with_imputations need to have the same length. x_with_na is the time series with NAs before imputation. x_with_imputations is the time series with filled NAs after applying imputation.")
  }

  # if x_with_truth available it needs also same length
  if (has_truth && (length(x_with_na) != length(x_with_truth))) {
    stop("Input x_with_na, x_with_imputations and x_with_truth need to have the same length. x_with_na is the time series with NAs before imputation. x_with_imputations is the time series with filled NAs after applying imputation. x_with_truth (optional) is the series with the ground truth for the imputed values")
  }

  # 1.7 Check preconditions about amount of NAs

  # Unwanted all NA inputs
  if (all(is.na(x_with_na))) {
    stop("Input x_with_na consists only of NAs. Something with the input likely went wrong. Creating a ggplot_na_imputations plot does not make sense with an all NA input. This are the required inputs: x_with_na (time series before imputation that still has NAs), x_with_imputations (time series after imputation, where NAs were replaced by imputation")
  }

  if (all(is.na(x_with_imputations))) {
    stop("Input x_with_imputations consists only of NAs. Something with the input likely went wrong. Creating a ggplot_na_imputations plot does not make sense with an all NA input. This are the required inputs: x_with_na (time series before imputation that still has NAs), x_with_imputations (time series after imputation, where NAs were replaced by imputation")
  }

  # Unwanted no NA inputs
  if (!anyNA(x_with_na)) {
    stop("Input x_with_na contains no NAs. At least one missing value is needed to create a meaningful ggplot_na_imputations plot) This are the required inputs: x_with_na (time series before imputation that still has NAs), x_with_imputations (time series after imputation, where NAs were replaced by imputation")
  }

  ##
  ## End Input Check and Transformation
  ##


  ##
  ## 2. Preparations
  ##

  # 2.1 Create dataframe for ggplot2

  # Define x-axis label data
  # if Date or POSIXct given for x_axis_labels time information can be plotted
  if (is.null(x_axis_labels)) {
    time <- seq_along(x_with_na)
  } else if (inherits(x_axis_labels, c("Date", "POSIXct"))) {
    time <- x_axis_labels
  } else {
    stop("Input for x_axis_labels is not in a supported format, must be a vector of Date or a POSIXct objects with the same length as x_with_na and x_with_imputations")
  }

  # Fail early with a clear message instead of letting data.frame() recycle
  # the labels or abort with an unrelated error
  if (length(time) != length(x_with_na)) {
    stop("Input x_axis_labels needs to have the same length as x_with_na and x_with_imputations")
  }

  if (has_truth) {
    df <- data.frame(time, x_with_imputations, x_with_na, x_with_truth)
  } else {
    df <- data.frame(time, x_with_imputations, x_with_na)
  }

  ##
  ## End Preparations
  ##


  ##
  ## 3. Create the ggplot2 plot
  ##

  # Create the plot
  gg <- ggplot2::ggplot(data = df)

  ## Add Lines
  # The line layer is added before df is thinned out below, so it still
  # sees the complete series.
  if (connect_na == FALSE) {
    # Don't connect the lines in the missing areas
    line_mapping <- ggplot2::aes(x = time, y = x_with_na)
  } else if (has_truth) {
    # If truth available connect the true values in the missing areas
    line_mapping <- ggplot2::aes(x = time, y = x_with_truth)
  } else {
    # If no truth available connect the imputed values in the missing areas
    line_mapping <- ggplot2::aes(x = time, y = x_with_imputations)
  }

  gg <- gg + ggplot2::geom_line(
    data = df, mapping = line_mapping, na.rm = TRUE,
    color = color_lines, linetype = linetype, linewidth = width_lines
  )

  # Remove known values from imputations - to avoid overplotting
  known <- !is.na(x_with_na)
  df$x_with_imputations[known] <- NA
  if (has_truth) {
    df$x_with_truth[known] <- NA
  }

  ## Add points

  # Points for regular, known values
  gg <- gg + ggplot2::geom_point(
    data = df, ggplot2::aes(x = time, y = x_with_na, color = "1"),
    na.rm = TRUE, shape = shape_points, size = size_points
  )

  # Points for Imputations
  gg <- gg + ggplot2::geom_point(
    data = df, ggplot2::aes(x = time, y = x_with_imputations, color = "2"),
    na.rm = TRUE, size = size_imputations, shape = shape_imputations
  )

  # Points for truth
  if (has_truth) {
    gg <- gg + ggplot2::geom_point(
      data = df, ggplot2::aes(x = time, y = x_with_truth, color = "3"),
      na.rm = TRUE, shape = shape_truth, size = size_truth
    )
  }

  # Legend entries: known values and imputations always, ground truth only
  # when it was supplied
  legend_breaks <- c("1", "2")
  legend_labels <- c(label_known, label_imputations)
  legend_colors <- c(color_points, color_imputations)
  legend_shapes <- c(shape_points, shape_imputations)
  if (has_truth) {
    legend_breaks <- c(legend_breaks, "3")
    legend_labels <- c(legend_labels, label_truth)
    legend_colors <- c(legend_colors, color_truth)
    legend_shapes <- c(legend_shapes, shape_truth)
  }

  gg <- gg + ggplot2::scale_color_manual(
    name = ggplot2::element_blank(),
    breaks = legend_breaks,
    labels = legend_labels,
    values = legend_colors
  )

  gg <- gg + ggplot2::ylab(ylab) + ggplot2::xlab(xlab) +
    ggplot2::ggtitle(label = title, subtitle = subtitle) +
    theme

  gg <- gg + ggplot2::guides(color = ggplot2::guide_legend(
    override.aes = list(size = legend_size, shape = legend_shapes)
  ))

  gg <- gg + ggplot2::theme(
    legend.position = base::ifelse(legend == TRUE, "bottom", "none"),
    legend.title = ggplot2::element_blank()
  )

  ##
  ## End creating the ggplot2 plot
  ##

  return(gg)
}

================================================
FILE: R/imputeTS-package.R
================================================
#' @keywords internal
"_PACKAGE"

#' @title imputeTS-package description
#'
#' @description
#' The imputeTS package is a collection of algorithms and tools for univariate time series imputation.
#'
#' @details The imputeTS package specializes in (univariate) time series imputation.
#' It offers several different imputation algorithm implementations. Beyond the imputation algorithms
#' the package also provides plotting and printing functions of missing data statistics.
#'
#' The package is easy to use:
#'
#' - To impute (fill all missing values) in a time series \code{x}, run:\cr
#' \code{na_interpolation(x)} \cr
#'
#' - To plot missing data statistics for a time series \code{x}, run:\cr
#' \code{ggplot_na_distribution(x)}\cr
#'
#' - To print missing data statistics for a time series \code{x}, run:\cr
#' \code{statsNA(x)}\cr
#'
#' Every other imputation function (starting with na_'algorithm name') and plotting
#' function (starting with ggplot_na_'plot name') work the same way as in this example.
#'
#' @name imputeTS-package
#'
#' @references Moritz, Steffen, and Thomas Bartz-Beielstein. "imputeTS: Time Series Missing Value Imputation in R." R Journal 9.1 (2017). doi:10.32614/RJ-2017-009.
#'
#' @import stats
#' @importFrom magrittr %>%
#' @importFrom utils globalVariables
#' @importFrom Rcpp sourceCpp
#' @useDynLib imputeTS
NULL

# Unload the package's dynamic library when the namespace is unloaded
.onUnload <- function (libpath) {
  library.dynam.unload("imputeTS", libpath)
}

utils::globalVariables(c("rule"))

#' @export
magrittr::`%>%`

================================================
FILE: R/internal_algorithm_interface.R
================================================
##De-Roxygenized to avoid appearance in the package documentation

# @title Algorithm selection (Internal function)
# @description Internal function for choosing between the basic univariate imputation algorithms
# @param x Series that is handed to the selected imputation function
# @param algorithm Single string, one of "locf", "mean", "random",
#   "interpolation", "kalman" or "ma"
# @param ... Further arguments passed on to the selected imputation function
# @return Whatever the selected imputation function returns for x
# @author Steffen Moritz
#' @import stats

apply_base_algorithm <- function(x, algorithm, ...) {

  # A factor still compares equal to its label, so keep accepting it
  if (is.factor(algorithm)) {
    algorithm <- as.character(algorithm)
  }

  # Anything that is not a single, non-missing string gets the regular
  # "wrong parameter" error instead of an obscure failure in a comparison
  if (!is.character(algorithm) || length(algorithm) != 1 || is.na(algorithm)) {
    stop("Wrong parameter for option algorithm chosen.")
  }

  data <- switch(algorithm,
    locf = na_locf(x, ...),
    mean = na_mean(x, ...),
    random = na_random(x, ...),
    interpolation = na_interpolation(x, ...),
    kalman = na_kalman(x, ...),
    ma = na_ma(x, ...),
    stop("Wrong parameter for option algorithm chosen.")
  )

  return(data)
}

================================================
FILE: R/na_interpolation.R
================================================
#' @title Missing Value Imputation by Interpolation
#'
#' @description Uses either linear, spline or stineman interpolation
#' to replace missing values.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param option Algorithm to be used.
Accepts the following input: #' \itemize{ #' \item{"linear" - for linear interpolation using \link{approx} } (default choice) #' \item{"spline" - for spline interpolation using \link{spline}} #' \item{"stine" - for Stineman interpolation using \link[stinepack]{stinterp}} #' } #' #' @param maxgap Maximum number of successive NAs to still perform imputation on. #' Default setting is to replace all NAs without restrictions. With this #' option set, consecutive NAs runs, that are longer than 'maxgap' will #' be left NA. This option mostly makes sense if you want to #' treat long runs of NA afterwards separately. #' #' @param ... Additional parameters to be passed through to \link{approx} or #' \link{spline} interpolation functions #' #' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) #' object (dependent on given input at parameter x) #' #' @details Missing values get replaced by values of \link{approx}, \link{spline} #' or \link[stinepack]{stinterp} interpolation. #' #' The na_interpolation function also supports the use of additional parameters from the respective #' underlying interpolation functions. While usually not really needed, it is useful to know that #' this advanced use is in principle possible. These additional parameters are not specified explicitly #' in the na_interpolation function documentation. Take a look into the documentation of the \link[stinepack]{stinterp}, \link{approx} and \link{spline} functions to get an overview about these additional parameters. #' #' An example for such a parameter is the 'method' argument of spline, which can be used to #' further specify the type of spline to be used. Possible values are "fmm", "natural", #' "periodic", "monoH.FC" and "hyman" (as can be seen in the \link{spline} #' documentation). 
#' The respective function call using this additional parameter would
#' look like this:
#' \code{na_interpolation(x, option ="spline", method ="natural")}
#'
#' Like in this example other additional detail parameters (gained from \link{approx},
#' \link{spline}, \link[stinepack]{stinterp} documentation) can be used by just including
#' them in the na_interpolation function call. As already mentioned, these advanced possibilities
#' for setting parameters are only helpful for specific use cases. For regular use
#' the standard parameters provided directly in the na_interpolation documentation should be
#' more than enough.
#'
#'
#' @author Steffen Moritz, Ron Hause
#'
#' @seealso \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Prerequisite: Create Time series with missing values
#' x <- ts(c(2, 3, 4, 5, 6, NA, 7, 8))
#'
#' # Example 1: Perform linear interpolation
#' na_interpolation(x)
#'
#' # Example 2: Perform spline interpolation
#' na_interpolation(x, option = "spline")
#'
#' # Example 3: Perform stine interpolation
#' na_interpolation(x, option = "stine")
#'
#' # Example 4: Perform spline interpolation, with additional parameter pass through to spline()
#' # Take a look at the 'Details' section of the na_interpolation documentation
#' # for more information about advanced parameter pass through options
#' na_interpolation(x, option ="spline", method ="natural")
#'
#' # Example 5: Same as example 1, just written with pipe operator
#' x %>% na_interpolation()
#'
#' # Example 6: Same as example 2, just written with pipe operator
#' x %>% na_interpolation(option = "spline")
#' @references Johannesson, Tomas, et al. (2015). "Package stinepack".
#' @importFrom stats ts approx spline
#' @importFrom methods hasArg
#' @importFrom stinepack stinterp
#' @importFrom magrittr %>%
#' @export
na_interpolation <- function(x, option = "linear", maxgap = Inf, ...) {

  # Variable 'data' is used for all transformations to the time series
  # 'x' needs to stay unchanged to be able to return the same ts class in the end
  data <- x

  #----------------------------------------------------------
  # Multivariate Input
  # Each column is imputed on its own by calling this function again.
  #----------------------------------------------------------

  # Check if the input is multivariate
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    # Go through columns and impute them by calling this function with univariate input
    for (i in seq_len(dim(data)[2])) {
      if (!anyNA(data[, i])) {
        next
      }
      # if imputing a column does not work - mostly because it is not numeric - the column is left unchanged
      tryCatch(
        # '...' is forwarded so that pass-through parameters also reach the
        # interpolation functions for multivariate input
        data[, i] <- na_interpolation(data[, i], option, maxgap, ...),
        error = function(cond) {
          warning(paste(
            "na_interpolation: No imputation performed for column", i,
            "of the input dataset. Reason:", cond[1]
          ), call. = FALSE)
        }
      )
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate Input
  # All relevant imputation / pre- postprocessing code is within this part
  #----------------------------------------------------------

  else {
    missindx <- is.na(data)

    ##
    ## 1. Input Check and Transformation
    ##

    # 1.1 Check if NAs are present
    if (!anyNA(data)) {
      return(x)
    }

    # 1.2 special handling data types
    if (any(class(data) == "tbl")) {
      data <- as.vector(as.data.frame(data)[, 1])
    }

    # 1.3 Check for algorithm specific minimum amount of non-NA values
    if (sum(!missindx) < 2) {
      stop("At least 2 non-NA data points required in the time series to apply na_interpolation.")
    }

    # 1.4 Checks and corrections for wrong data dimension

    # Check if input dimensionality is not as expected
    if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
      stop("Wrong input type for parameter x.")
    }

    # Altering multivariate objects with 1 column (which are essentially
    # univariate) to be dim = NULL
    if (!is.null(dim(data)[2])) {
      data <- data[, 1]
    }

    # 1.5 Check if input is numeric
    if (!is.numeric(data)) {
      stop("Input x is not numeric.")
    }

    ##
    ## End Input Check
    ##


    ##
    ## 2. Imputation Code
    ##

    n <- length(data)

    allindx <- seq_len(n)
    indx <- allindx[!missindx]

    data_vec <- as.vector(data)

    # Linear Interpolation
    if (option == "linear") {
      # Check if 'rule' is used in function call, to allow parameter pass through for rule
      # Needed since parameter pass through via (...) to approx does not work, when value for 'rule' is also set in the code.
      if (methods::hasArg(rule)) {
        interp <- stats::approx(indx, data_vec[indx], allindx, ...)$y
      } else {
        interp <- stats::approx(indx, data_vec[indx], allindx, rule = 2, ...)$y
      }
    }
    # Spline Interpolation
    else if (option == "spline") {
      # Evaluate the spline exactly at the positions 1..n. Asking for 'n'
      # points instead yields an equally spaced grid between the first and
      # the last observed position, which does not line up with the series
      # as soon as it starts or ends with NA.
      interp <- stats::spline(indx, data_vec[indx], xout = allindx, ...)$y
    }
    # Stineman Interpolation
    else if (option == "stine") {
      interp <- stinepack::stinterp(indx, data_vec[indx], allindx, ...)$y
      # avoid NAs at the beginning and end of series // same behavior like
      # for approx with rule = 2.
      if (anyNA(interp)) {
        interp <- na_locf(interp, na_remaining = "rev")
      }
    }
    # Wrong parameter option
    else {
      stop("Wrong parameter 'option' given. Value must be either 'linear', 'spline' or 'stine'.")
    }

    # Merge interpolated values back into original time series
    data[missindx] <- interp[missindx]

    ##
    ## End Imputation Code
    ##


    ##
    ## 3. Post Processing
    ##

    # 3.1 Check for Maxgap option

    # If maxgap = Inf then do nothing and when maxgap is lower than 0
    if (is.finite(maxgap) && maxgap >= 0) {

      # Get logical vector of the time series via is.na() and then get the
      # run-length encoding of it. The run-length encoding describes how long
      # the runs of FALSE and TRUE are.
      # as.vector() drops the dim attribute that is.na() keeps for one-column
      # matrix / data.frame input, which rle() would reject.
      rlencoding <- rle(as.vector(is.na(x)))

      # Runs smaller than maxgap (which shall still be imputed) are set FALSE
      rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE

      # The logical vector is rebuilt by inverse.rle(); afterwards only NA
      # runs longer than maxgap are still marked TRUE
      en <- inverse.rle(rlencoding)

      # Set all positions in the imputed series with gaps > maxgap to NA
      # (info from en vector)
      data[en] <- NA
    }

    ##
    ## End Post Processing
    ##


    ##
    ## 4. Final Output Formatting
    ##

    # Give back the object originally supplied to the function
    # (necessary for multivariate input with only 1 column)
    if (!is.null(dim(x)[2])) {
      x[, 1] <- data
      return(x)
    }

    ##
    ## End Final Output Formatting
    ##

    return(data)
  }
}

================================================
FILE: R/na_kalman.R
================================================
#' @title Missing Value Imputation by Kalman Smoothing and State Space Models
#'
#' @description Uses Kalman Smoothing on structural time series models
#' (or on the state space representation of an arima model) for imputation.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param model Model to be used. With this parameter the State Space Model
#' (on which KalmanSmooth is performed) can be chosen.
Accepts the following input: #' #' \itemize{ #' #' \item{"StructTS" - For using a structural model fitted by maximum #' likelihood (using \link[stats]{StructTS}) } (default choice) #' #' \item{"auto.arima" - For using the state space representation of #' arima model (using \link[forecast]{auto.arima})} #' #' } #' #' For both auto.arima and StructTS additional parameters for model building can #' be given with the \dots parameter #' #' Additionally it is also possible to use a user created state space model #' (See code Example 5). This state space model could for example be #' obtained from another R package for structural time series modeling. #' Furthermore providing the state space representation of a arima model #' from \link[stats]{arima} is also possible. But it is important to note, #' that user created state space models must meet the requirements specified #' under \link[stats]{KalmanLike}. This means the user supplied state space #' model has to be in form of a list with at least components T, Z, h , V, a, P, Pn. #' (more details under \link[stats]{KalmanLike}) #' #' @param smooth if \code{TRUE} - \code{\link[stats]{KalmanSmooth}} is used for #' estimation, if \code{FALSE} - \code{\link[stats]{KalmanRun}} is used. #' Since KalmanRun is often considered extrapolation KalmanSmooth is usually #' the better choice for imputation. #' #' @param nit Parameter from Kalman Filtering (see \link[stats]{KalmanLike}). #' Usually no need to change from default. #' #' @param maxgap Maximum number of successive NAs to still perform imputation on. #' Default setting is to replace all NAs without restrictions. With this #' option set, consecutive NAs runs, that are longer than 'maxgap' will #' be left NA. This option mostly makes sense if you want to #' treat long runs of NA afterwards separately. #' #' @param ... Additional parameters to be passed through to the functions that #' build the State Space Models (\link[stats]{StructTS} or \link[forecast]{auto.arima}). 
#'
#' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object (dependent on given input at parameter x)
#'
#' @details The KalmanSmoother used in this function is \code{\link[stats]{KalmanSmooth}}.
#' It operates either on a \code{Basic Structural Model} obtained by
#' \code{\link[stats]{StructTS}} or the state space representation of an ARMA model
#' obtained by \code{\link[forecast]{auto.arima}}.
#'
#' For a detailed explanation of Kalman Filtering and State Space Models the
#' following literature is a good starting point:
#' \itemize{
#' \item{\cite{G. Welch, G. Bishop, An Introduction to the Kalman Filter. SIGGRAPH 2001 Course 8, 1995}}
#' \item{\cite{Harvey, Andrew C. Forecasting, structural time series models and the Kalman filter. Cambridge university press, 1990} }
#' \item{\cite{Grewal, Mohinder S. Kalman filtering. Springer Berlin Heidelberg, 2011}}
#' }
#'
#' @author Steffen Moritz
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Example 1: Perform imputation with KalmanSmooth and StructTS model (default settings)
#' na_kalman(tsAirgap)
#'
#' # Example 2: Perform imputation with KalmanRun and StructTS model
#' na_kalman(tsAirgap, smooth = FALSE)
#'
#' # Example 3: Perform imputation with KalmanSmooth and StructTS model
#' na_kalman(tsAirgap, model = "StructTS", smooth = TRUE)
#'
#' # Example 4: Perform imputation with KalmanSmooth and StructTS model with additional parameters
#' na_kalman(tsAirgap, model = "StructTS", smooth = TRUE, type = "trend")
#'
#' # Example 5: Perform imputation with KalmanSmooth and user created model
#' usermodel <- arima(tsAirgap, order = c(1, 0, 1))$model
#' na_kalman(tsAirgap, model =
#' usermodel)
#'
#' # Example 6: Same as example 1, just written with pipe operator
#' tsAirgap %>% na_kalman()
#' @references Hyndman RJ and Khandakar Y (2008). "Automatic time series forecasting: the forecast package for R". Journal of Statistical Software, 26(3).
#' @importFrom stats StructTS KalmanSmooth KalmanRun arima
#' @importFrom forecast auto.arima
#' @importFrom magrittr %>%
#' @export
na_kalman <- function(x, model = "StructTS", smooth = TRUE, nit = -1, maxgap = Inf, ...) {

  # Variable 'data' is used for all transformations to the time series
  # 'x' needs to stay unchanged to be able to return the same ts class in the end
  data <- x

  #----------------------------------------------------------
  # Multivariate Input
  # Each column is imputed on its own by calling this function again.
  #----------------------------------------------------------

  # Check if the input is multivariate
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    # Go through columns and impute them by calling this function with univariate input
    for (i in seq_len(dim(data)[2])) {
      if (!anyNA(data[, i])) {
        next
      }
      # if imputing a column does not work - mostly because it is not numeric - the column is left unchanged
      tryCatch(
        data[, i] <- na_kalman(data[, i], model, smooth, nit, maxgap, ...),
        error = function(cond) {
          warning(paste(
            "na_kalman: No imputation performed for column", i,
            "of the input dataset. Reason:", cond[1]
          ), call. = FALSE)
        }
      )
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate Input
  # All relevant imputation / pre- postprocessing code is within this part
  #----------------------------------------------------------

  else {
    missindx <- is.na(data)

    ##
    ## 1. Input Check and Transformation
    ##

    # 1.1 Check if NAs are present
    if (!anyNA(data)) {
      return(x)
    }

    # 1.2 special handling data types
    if (any(class(data) == "tbl")) {
      data <- as.vector(as.data.frame(data)[, 1])
    }

    # 1.3 Check for algorithm specific minimum amount of non-NA values
    if (sum(!missindx) < 3) {
      stop("At least 3 non-NA data points required in the time series to apply na_kalman.")
    }

    # 1.4 Checks and corrections for wrong data dimension

    # Check if input dimensionality is not as expected
    if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
      stop("Wrong input type for parameter x.")
    }

    # Altering multivariate objects with 1 column (which are essentially
    # univariate) to be dim = NULL
    if (!is.null(dim(data)[2])) {
      data <- data[, 1]
    }

    # 1.5 Check if input is numeric
    if (!is.numeric(data)) {
      stop("Input x is not numeric.")
    }

    # 1.6 Check if type of parameter smooth is correct
    # NA and non-scalar values are rejected here as well, otherwise they
    # would only fail later in the if () that selects the Kalman routine
    if (!is.logical(smooth) || length(smooth) != 1 || is.na(smooth)) {
      stop("Parameter smooth must be of type logical ( TRUE / FALSE).")
    }

    # 1.7 Transformation to numeric as 'int' can't be given to KalmanRun
    # (assignment into data[...] keeps the attributes of 'data')
    data[1:length(data)] <- as.numeric(data)

    # The built-in models are selected by name, everything else is treated
    # as a user supplied state space model
    model_name <- if (is.character(model)) model[1] else NA_character_

    # 1.8 Check for and mitigate all constant values in combination with StructTS
    # See https://github.com/SteffenMoritz/imputeTS/issues/26
    # NAs are present at this point, so two unique values mean one constant
    # observed value plus NA. maxgap is forwarded so the fallback honors it.
    if (identical(model_name, "StructTS") && length(unique(as.vector(data))) == 2) {
      return(na_interpolation(x, maxgap = maxgap))
    }

    ##
    ## End Input Check and Transformation
    ##


    ##
    ## 2. Imputation Code
    ##

    # 2.1 Selection of state space model

    # State space representation of a arima model
    if (identical(model_name, "auto.arima")) {
      mod <- forecast::auto.arima(data, ...)$model
    }
    # State space model, default is BSM - basic structural model
    else if (identical(model_name, "StructTS")) {
      # Fallback, in StructTS first value is not allowed to be NA, thus take first non-NA
      # (the placeholder is overwritten again when the imputations are merged)
      if (is.na(data[1])) {
        data[1] <- data[which.min(is.na(data))]
      }
      mod <- stats::StructTS(data, ...)$model0
    }
    # User supplied model e.g. created with arima() or other state space models from other packages
    else {
      mod <- model
      if (length(mod) < 7) {
        stop("Parameter model has either to be \"StructTS\"/\"auto.arima\" or a user supplied model in form of a list with at least components T, Z, h , V, a, P, Pn specified.")
      }

      if (is.null(mod$Z)) {
        stop("Something is wrong with the user supplied model. Either choose \"auto.arima\" or \"StructTS\" or supply a state space model with at least components T, Z, h , V, a, P, Pn as specified under Details on help page for KalmanLike.")
      }
    }

    # 2.2 Selection if KalmanSmooth or KalmanRun
    if (smooth) {
      kal <- stats::KalmanSmooth(data, mod, nit)
      erg <- kal$smooth # for kalmanSmooth
    } else {
      kal <- stats::KalmanRun(data, mod, nit)
      erg <- kal$states # for kalmanrun
    }

    # Check if everything is right with the model
    if (dim(erg)[2] != length(mod$Z)) {
      stop("Error with number of components $Z.")
    }

    # 2.3 Getting Results

    # Out of all components in $states or $smooth only the ones
    # which have 1 or -1 in $Z are in the model
    # Therefore matrix multiplication is done
    karima <- erg[missindx, , drop = FALSE] %*% as.matrix(mod$Z)

    # Add imputations to the initial dataset
    data[missindx] <- karima

    ##
    ## End Imputation Code
    ##


    ##
    ## 3. Post Processing
    ##

    # 3.1 Check for Maxgap option

    # If maxgap = Inf then do nothing and when maxgap is lower than 0
    if (is.finite(maxgap) && maxgap >= 0) {

      # Get logical vector of the time series via is.na() and then get the
      # run-length encoding of it. The run-length encoding describes how long
      # the runs of FALSE and TRUE are.
      # as.vector() drops the dim attribute that is.na() keeps for one-column
      # matrix / data.frame input, which rle() would reject.
      rlencoding <- rle(as.vector(is.na(x)))

      # Runs smaller than maxgap (which shall still be imputed) are set FALSE
      rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE

      # The logical vector is rebuilt by inverse.rle(); afterwards only NA
      # runs longer than maxgap are still marked TRUE
      en <- inverse.rle(rlencoding)

      # Set all positions in the imputed series with gaps > maxgap to NA
      # (info from en vector)
      data[en] <- NA
    }

    ##
    ## End Post Processing
    ##


    ##
    ## 4. Final Output Formatting
    ##

    # Give back the object originally supplied to the function
    # (necessary for multivariate input with only 1 column)
    if (!is.null(dim(x)[2])) {
      x[, 1] <- data
      return(x)
    }

    ##
    ## End Final Output Formatting
    ##

    return(data)
  }
}

================================================
FILE: R/na_locf.R
================================================
#' @title Missing Value Imputation by Last Observation Carried Forward
#'
#' @description Replaces each missing value with the most recent present value
#' prior to it (Last Observation Carried Forward- LOCF). Optionally this can
#' also be done starting from the back of the series (Next Observation Carried
#' Backward - NOCB).
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param option Algorithm to be used. Accepts the following input:
#' \itemize{
#' \item{"locf" - for Last Observation Carried Forward} (default choice)
#' \item{"nocb" - for Next Observation Carried Backward}
#' }
#'
#' @param na_remaining Method to be used for remaining NAs.
#' \itemize{ #' \item{"rev" - to perform nocb / locf from the reverse direction} (default choice) #' \item{"keep" - to return the series with NAs} #' \item{"rm" - to remove remaining NAs} #' \item{"mean" - to replace remaining NAs by overall mean} #' } #' #' @param maxgap Maximum number of successive NAs to still perform imputation on. #' Default setting is to replace all NAs without restrictions. With this #' option set, consecutive NAs runs, that are longer than 'maxgap' will #' be left NA. This option mostly makes sense if you want to #' treat long runs of NA afterwards separately. #' #' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) #' object (dependent on given input at parameter x) #' #' @details #' #' ## General Functionality #' Replaces each missing value with the most recent present value #' prior to it (Last Observation Carried Forward - LOCF). This can also be #' done in reverse direction, starting from the end of the series (then #' called Next Observation Carried Backward - NOCB). #' #' #' ## Handling for NAs at the beginning of the series #' In case one or more successive observations directly at the start of the #' time series are NA, there exists no 'last value' yet, that can be carried #' forward. Thus, no LOCF imputation can be performed for these NAs. As soon #' as the first non-NA value appears, LOCF can be performed as expected. The #' same applies to NOCB, but from the opposite direction. #' #' While this problem might appear seldom and will only affect a very small #' amount of values at the beginning, it is something to consider. #' The \code{na_remaining} parameter helps to define, what should happen #' with these values at the start, that would remain NA after pure LOCF. #' #' Default setting is \code{na_remaining = "rev"}, which performs #' nocb / locf from the other direction to fill these NAs. So a NA #' at the beginning will be filled with the next non-NA value appearing #' in the series. 
#'
#' With \code{na_remaining = "keep"} NAs at the beginning (that can not
#' be imputed with pure LOCF) are just left as remaining NAs.
#'
#' With \code{na_remaining = "rm"} NAs at the beginning of the series are
#' completely removed. Thus, the time series is basically shortened.
#'
#' Also available is \code{na_remaining = "mean"}, which uses the overall
#' mean of the time series to replace these remaining NAs. (but beware,
#' mean is usually not a good imputation choice - even if it only affects
#' the values at the beginning)
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Prerequisite: Create Time series with missing values
#' x <- ts(c(NA, 3, 4, 5, 6, NA, 7, 8))
#'
#' # Example 1: Perform LOCF
#' na_locf(x)
#'
#' # Example 2: Perform NOCB
#' na_locf(x, option = "nocb")
#'
#' # Example 3: Perform LOCF and remove remaining NAs
#' na_locf(x, na_remaining = "rm")
#'
#' # Example 4: Same as example 1, just written with pipe operator
#' x %>% na_locf()
#' @importFrom stats ts
#' @importFrom magrittr %>%
#' @export
na_locf <- function(x, option = "locf", na_remaining = "rev", maxgap = Inf) {
  # All transformations happen on the copy 'data'; 'x' stays untouched so the
  # object that was originally supplied can be handed back at the end
  data <- x
  n_cols <- dim(data)[2]

  #----------------------------------------------------------
  # Multivariate input: impute column by column through recursive
  # univariate calls and return right away
  #----------------------------------------------------------
  if (!is.null(n_cols) && n_cols > 1) {
    for (i in 1:n_cols) {
      # Columns without NAs need no work
      if (anyNA(data[, i])) {
        # A column that can not be imputed (mostly because it is not numeric)
        # is left unchanged and reported through a warning
        tryCatch(
          data[, i] <- na_locf(data[, i], option, na_remaining, maxgap),
          error = function(cond) {
            warning(paste(
              "na_locf: No imputation performed for column", i,
              "of the input dataset. Reason:", cond[1]
            ), call. = FALSE)
          }
        )
      }
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate input: everything below handles a single series
  #----------------------------------------------------------

  # Positions of the missing values in the input as supplied
  missindx <- is.na(data)

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Nothing to impute -> give the input back unchanged
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 Objects carrying the "tbl" class are flattened to their first column
  if ("tbl" %in% class(data)) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 At least one observed value is needed to carry anything forward
  if (all(missindx)) {
    stop("Input data has only NA values. At least 1 non-NA data point required in the time series to apply na_locf.")
  }

  # 1.4 Only objects with exactly one column (or no dim at all) are accepted;
  # one-column objects are reduced to a plain vector
  if (!is.null(dim(data)[2]) && dim(data)[2] != 1) {
    stop("Wrong input type for parameter x.")
  }
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.5 The algorithm only works on numeric data
  if (!is.numeric(data)) {
    stop("Input x is not numeric.")
  }

  ##
  ## 2. Imputation Code
  ##

  # 2.1 Translate 'option' into the direction flag handed to locf():
  # FALSE for "locf", TRUE for "nocb"
  if (option == "locf") {
    carry_backward <- FALSE
  } else if (option == "nocb") {
    carry_backward <- TRUE
  } else {
    stop("Wrong parameter 'option' given. Value must be either 'locf' or 'nocb'.")
  }
  imputed <- locf(as.vector(data), carry_backward)
  data[missindx] <- imputed[missindx]

  # 2.2 Treat the NAs that are still present according to 'na_remaining'
  # ("keep" or no NAs left -> nothing to do)
  if (anyNA(data) && na_remaining != "keep") {
    if (na_remaining == "rev") {
      # Fill from the opposite direction; 'option' was validated above, so it
      # is either "locf" or "nocb" here
      opposite <- if (option == "locf") "nocb" else "locf"
      data <- na_locf(data, option = opposite)
    } else if (na_remaining == "rm") {
      data <- na_remove(data)
    } else if (na_remaining == "mean") {
      data <- na_mean(data)
    } else {
      stop("Wrong parameter 'na_remaining' given. Value must be either 'keep', 'rm', 'mean' or 'rev'.")
    }
  }

  ##
  ## 3. Post Processing
  ##

  # 3.1 maxgap: only acts for finite, non-negative values
  if (is.finite(maxgap) && maxgap >= 0) {
    # Run-length encode the NA pattern of the original input, unmark every
    # run whose length is <= maxgap, and expand back to a logical vector that
    # is TRUE only inside NA runs longer than maxgap
    na_runs <- rle(is.na(x))
    na_runs$values[na_runs$lengths <= maxgap] <- FALSE
    too_long <- inverse.rle(na_runs)

    # Those positions are set back to NA
    data[too_long] <- NA
  }

  ##
  ## 4. Final Output Formatting
  ##

  # Input without a second dimension -> return the imputed series itself
  if (is.null(dim(x)[2])) {
    return(data)
  }

  # One-column input -> write the result into the original object
  x[, 1] <- data
  return(x)
}


================================================
FILE: R/na_ma.R
================================================
#' @title Missing Value Imputation by Weighted Moving Average
#'
#' @description Missing value replacement by weighted moving average.
#' Uses semi-adaptive window size to ensure all NAs are replaced.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param weighting Weighting to be used. Accepts the following input:
#' \itemize{
#' \item{"simple" - Simple Moving Average (SMA)}
#' \item{"linear" - Linear Weighted Moving Average (LWMA)}
#' \item{"exponential" - Exponential Weighted Moving Average (EWMA)} (default choice)
#' }
#'
#' @param k integer width of the moving average window. Expands to both sides
#' of the center element e.g. k=2 means 4 observations (2 left, 2 right) are
#' taken into account. If all observations in the current window are NA, the
#' window size is automatically increased until there are at least 2 non-NA
#' values present.
#' #' @param maxgap Maximum number of successive NAs to still perform imputation on. #' Default setting is to replace all NAs without restrictions. With this #' option set, consecutive NAs runs, that are longer than 'maxgap' will #' be left NA. This option mostly makes sense if you want to #' treat long runs of NA afterwards separately. #' #' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) #' object (dependent on given input at parameter x) #' #' @details In this function missing values get replaced by moving average #' values. Moving Averages are also sometimes referred to as "moving mean", #' "rolling mean", "rolling average" or "running average". #' #' The mean in this implementation is taken from an equal number of observations #' on either side of a central value. This means for an NA value at position #' \code{i} of a time series, the observations i-2, i-1 and i+1, i+2 (assuming #' a window size of k=2) are used to calculate the mean. #' #' Since it can in case of long NA gaps also occur, that all values next to the #' central value are also NA, the algorithm has a semi-adaptive window size. #' Whenever there are less than 2 non-NA values in the complete window available, #' the window size is incrementally increased, till at least 2 non-NA values are #' there. In all other cases the algorithm sticks to the pre-set window size. #' #' There are options for using Simple Moving Average (SMA), Linear Weighted #' Moving Average (LWMA) and Exponential Weighted Moving Average (EWMA). #' #' SMA: all observations in the window are equally weighted for calculating the mean. #' #' LWMA: weights decrease in arithmetical progression. The observations #' directly next to a central value i, have weight 1/2, the observations #' one further away (i-2,i+2) have weight 1/3, the next (i-3,i+3) have #' weight 1/4, ... #' #' EWMA: uses weighting factors which decrease exponentially.
#' The observations directly next to a central value i, have weight 1/2^1,
#' the observations one further away (i-2,i+2) have weight 1/2^2, the next
#' (i-3,i+3) have weight 1/2^3, ...
#'
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Example 1: Perform imputation with simple moving average
#' na_ma(tsAirgap, weighting = "simple")
#'
#' # Example 2: Perform imputation with exponential weighted moving average
#' na_ma(tsAirgap)
#'
#' # Example 3: Perform imputation with exponential weighted moving average, window size 6
#' na_ma(tsAirgap, k = 6)
#'
#' # Example 4: Same as example 1, just written with pipe operator
#' tsAirgap %>% na_ma(weighting = "simple")
#' @importFrom magrittr %>%
#' @export
na_ma <- function(x, k = 4, weighting = "exponential", maxgap = Inf) {
  # All transformations happen on the copy 'data'; 'x' stays untouched so the
  # object that was originally supplied can be handed back at the end
  data <- x
  n_cols <- dim(data)[2]

  #----------------------------------------------------------
  # Multivariate input: impute column by column through recursive
  # univariate calls and return right away
  #----------------------------------------------------------
  if (!is.null(n_cols) && n_cols > 1) {
    for (i in 1:n_cols) {
      # Columns without NAs need no work
      if (anyNA(data[, i])) {
        # A column that can not be imputed (mostly because it is not numeric)
        # is left unchanged and reported through a warning
        tryCatch(
          data[, i] <- na_ma(data[, i], k, weighting, maxgap),
          error = function(cond) {
            warning(paste(
              "na_ma: No imputation performed for column", i,
              "of the input dataset. Reason:", cond[1]
            ), call. = FALSE)
          }
        )
      }
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate input: everything below handles a single series
  #----------------------------------------------------------

  # Positions of the missing values in the input as supplied
  missindx <- is.na(data)

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Nothing to impute -> give the input back unchanged
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 Objects carrying the "tbl" class are flattened to their first column
  if ("tbl" %in% class(data)) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 The moving average needs at least two observed values
  if (sum(!missindx) < 2) {
    stop("At least 2 non-NA data points required in the time series to apply na_ma.")
  }

  # 1.4 Only objects with exactly one column (or no dim at all) are accepted;
  # one-column objects are reduced to a plain vector
  if (!is.null(dim(data)[2]) && dim(data)[2] != 1) {
    stop("Wrong input type for parameter x.")
  }
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.5 The algorithm only works on numeric data
  if (!is.numeric(data)) {
    stop("Input x is not numeric.")
  }

  # 1.6 The window width must be at least 1
  if (k < 1) {
    stop("Parameter k has to be larger than 0.")
  }

  ##
  ## 2. Imputation Code
  ##

  # The actual moving average imputation is delegated to ma()
  data <- ma(data, k, weighting)

  ##
  ## 3. Post Processing
  ##

  # 3.1 maxgap: only acts for finite, non-negative values
  if (is.finite(maxgap) && maxgap >= 0) {
    # Run-length encode the NA pattern of the original input, unmark every
    # run whose length is <= maxgap, and expand back to a logical vector that
    # is TRUE only inside NA runs longer than maxgap
    na_runs <- rle(is.na(x))
    na_runs$values[na_runs$lengths <= maxgap] <- FALSE
    too_long <- inverse.rle(na_runs)

    # Those positions are set back to NA
    data[too_long] <- NA
  }

  ##
  ## 4. Final Output Formatting
  ##

  # Input without a second dimension -> return the imputed series itself
  if (is.null(dim(x)[2])) {
    return(data)
  }

  # One-column input -> write the result into the original object
  x[, 1] <- data
  return(x)
}


================================================
FILE: R/na_mean.R
================================================
#' @title Missing Value Imputation by Mean Value
#'
#' @description Missing value replacement by mean values. Different means
#' like median, mean, mode possible.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param option Algorithm to be used. Accepts the following input:
#' \itemize{
#' \item{"mean" - take the mean for imputation (default choice)}
#' \item{"median" - take the median for imputation}
#' \item{"mode" - take the mode for imputation}
#' \item{"harmonic" - take the harmonic mean}
#' \item{"geometric" - take the geometric mean}
#' }
#'
#' @param maxgap Maximum number of successive NAs to still perform imputation on.
#' Default setting is to replace all NAs without restrictions. With this
#' option set, consecutive NAs runs, that are longer than 'maxgap' will
#' be left NA. This option mostly makes sense if you want to
#' treat long runs of NA afterwards separately.
#'
#' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object (dependent on given input at parameter x)
#'
#' @details Missing values get replaced by overall mean values. The function
#' calculates the mean, median, mode, harmonic or geometric mean over all the non-NA
#' values and replaces all NAs with this value. Option 'mode' replaces NAs with
#' the most frequent value in the time series. If two or more values occur equally frequent,
#' the function imputes the lower value. Due to their calculation formula geometric and harmonic
#' mean are not well defined for negative values or zero values in the input series.
#'
#' In general using the mean for imputation is mostly a suboptimal choice and should
#' be handled with great caution.
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Prerequisite: Create Time series with missing values
#' x <- ts(c(2, 3, 4, 5, 6, NA, 7, 8))
#'
#' # Example 1: Perform imputation with the overall mean
#' na_mean(x)
#'
#' # Example 2: Perform imputation with overall median
#' na_mean(x, option = "median")
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' x %>% na_mean()
#' @importFrom magrittr %>%
#' @importFrom stats median ts
#' @export
#'
na_mean <- function(x, option = "mean", maxgap = Inf) {
  # Variable 'data' is used for all transformations to the time series
  # 'x' needs to stay unchanged to be able to return the same ts class in the end
  data <- x

  #----------------------------------------------------------
  # Multivariate Input
  # Each column is imputed through a recursive univariate call
  #----------------------------------------------------------
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    for (i in 1:dim(data)[2]) {
      # Columns without NAs need no work
      if (!anyNA(data[, i])) {
        next
      }
      # if imputing a column does not work - mostly because it is not
      # numeric - the column is left unchanged and a warning is raised
      tryCatch(
        data[, i] <- na_mean(data[, i], option, maxgap),
        error = function(cond) {
          warning(paste(
            "na_mean: No imputation performed for column", i,
            "of the input dataset. Reason:", cond[1]
          ), call. = FALSE)
        }
      )
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate Input
  # All relevant imputation / pre- postprocessing code is within this part
  #----------------------------------------------------------

  # Positions of the missing values in the input as supplied
  missindx <- is.na(data)

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Check if NAs are present
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 special handling data types
  if (any(class(data) == "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 Check for algorithm specific minimum amount of non-NA values
  if (all(missindx)) {
    stop("Input data has only NA values. At least 1 non-NA data point required in the time series to apply na_mean.")
  }

  # 1.4 Checks and corrections for wrong data dimension

  # Check if input dimensionality is not as expected
  if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
    stop("Wrong input type for parameter x.")
  }

  # Altering multivariate objects with 1 column (which are essentially
  # univariate) to be dim = NULL
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.5 Check if input is numeric
  if (!is.numeric(data)) {
    stop("Input x is not numeric.")
  }

  ##
  ## 2. Imputation Code
  ##

  # The replacement value is stored in 'fill_value' so that the base functions
  # mean(), median() and mode() are not shadowed by local variables
  if (option == "median") {
    fill_value <- stats::median(data, na.rm = TRUE)
  } else if (option == "mode") {
    # Count occurrences on the numeric values themselves. Going through the
    # character labels of table() rounds doubles to 15 significant digits, so
    # the imputed value could differ from every observed value.
    observed <- as.vector(data)
    candidates <- sort(unique(observed)) # sort() drops NA / NaN
    counts <- tabulate(match(observed, candidates), nbins = length(candidates))
    # which.max() picks the first maximum, i.e. the lowest value on ties.
    # as.numeric() keeps the replacement a double, as before.
    fill_value <- as.numeric(candidates[which.max(counts)])
  } else if (option == "mean") {
    fill_value <- mean(data, na.rm = TRUE)
  } else if (option == "geometric" || option == "harmonic") {
    # Both means are undefined for zero or negative observations
    if (any(data <= 0, na.rm = TRUE)) {
      stop(
        "The input data contains 0 and/or negative values.\n",
        "The geometric and harmonic mean are not well defined for these cases.\n",
        "Please another option like e.g. option = 'mean' in this case."
      )
    }
    if (option == "geometric") {
      fill_value <- exp(mean(log(data), na.rm = TRUE))
    } else {
      fill_value <- 1 / mean(1 / data, na.rm = TRUE)
    }
  } else {
    stop("Wrong 'option' parameter given, must be either: \n'mean', 'mode', 'median', 'harmonic' or 'geometric'.")
  }
  data[missindx] <- fill_value

  ##
  ## 3. Post Processing
  ##

  # 3.1 Check for Maxgap option

  # Only finite, non-negative maxgap values have an effect
  if (is.finite(maxgap) && maxgap >= 0) {
    # Run-length encoding of the NA pattern of the original input
    rlencoding <- rle(is.na(x))

    # Runs of length <= maxgap (which shall still be imputed) are set FALSE
    rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE

    # Expanded back, the vector is TRUE only inside NA runs longer than maxgap
    en <- inverse.rle(rlencoding)

    # Set all positions in the imputed series with gaps > maxgap to NA
    data[en == TRUE] <- NA
  }

  ##
  ## 4. Final Output Formatting
  ##

  # Give back the object originally supplied to the function
  # (necessary for multivariate input with only 1 column)
  if (!is.null(dim(x)[2])) {
    x[, 1] <- data
    return(x)
  }

  return(data)
}


================================================
FILE: R/na_random.R
================================================
#' @title Missing Value Imputation by Random Sample
#'
#' @description Replaces each missing value by drawing a random sample
#' between two given bounds.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param lower_bound Lower bound for the random samples.
#' If nothing or NULL is set min(x) will be used.
#'
#' @param upper_bound Upper bound for the random samples.
#' If nothing or NULL is set max(x) will be used.
#'
#' @param maxgap Maximum number of successive NAs to still perform imputation on.
#' Default setting is to replace all NAs without restrictions. With this
#' option set, consecutive NAs runs, that are longer than 'maxgap' will
#' be left NA. This option mostly makes sense if you want to
#' treat long runs of NA afterwards separately.
#'
#' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object (dependent on given input at parameter x)
#'
#' @details Replaces each missing value by drawing a random sample between two
#' given bounds. The default bounds are the minimum and the maximum value in
#' the non-NAs from the time series. Function uses \link{runif} function to get
#' the random values.
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Prerequisite: Create Time series with missing values
#' x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8))
#'
#' # Example 1: Replace all NAs by random values that are between min and max of the input time series
#' na_random(x)
#'
#' # Example 2: Replace all NAs by random values between 1 and 10
#' na_random(x, lower_bound = 1, upper_bound = 10)
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' x %>% na_random()
#' @importFrom stats runif ts
#' @importFrom magrittr %>%
#' @export
na_random <- function(x, lower_bound = NULL, upper_bound = NULL, maxgap = Inf) {
  # All transformations happen on the copy 'data'; 'x' stays untouched so the
  # object that was originally supplied can be handed back at the end
  data <- x
  n_cols <- dim(data)[2]

  #----------------------------------------------------------
  # Multivariate input: impute column by column through recursive
  # univariate calls and return right away
  #----------------------------------------------------------
  if (!is.null(n_cols) && n_cols > 1) {
    for (i in 1:n_cols) {
      # Columns without NAs need no work
      if (anyNA(data[, i])) {
        # A column that can not be imputed (mostly because it is not numeric)
        # is left unchanged and reported through a warning
        tryCatch(
          data[, i] <- na_random(data[, i], lower_bound, upper_bound, maxgap),
          error = function(cond) {
            warning(paste(
              "na_random: No imputation performed for column", i,
              "of the input dataset. Reason:", cond[1]
            ), call. = FALSE)
          }
        )
      }
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate input: everything below handles a single series
  #----------------------------------------------------------

  # Positions of the missing values in the input as supplied
  missindx <- is.na(data)

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Nothing to impute -> give the input back unchanged
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 Objects carrying the "tbl" class are flattened to their first column
  if ("tbl" %in% class(data)) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 When at least one bound has to be derived from the data, two observed
  # values are required
  if (sum(!missindx) < 2 && (is.null(upper_bound) || is.null(lower_bound))) {
    stop("At least 2 non-NA data points required in the time series to apply na_random with the default lower_bound and upper_bound settings.")
  }

  # 1.4 Only objects with exactly one column (or no dim at all) are accepted;
  # one-column objects are reduced to a plain vector
  if (!is.null(dim(data)[2]) && dim(data)[2] != 1) {
    stop("Wrong input type for parameter x.")
  }
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.5 Input has to be numeric. An all-NA vector is let through here, since
  # is.numeric() is FALSE for it
  if (!(is.numeric(data) || all(is.na(data)))) {
    stop("Input x is not numeric.")
  }

  # 1.6 Bounds: NULL (the default) means use min / max of the observed values
  if (is.null(lower_bound)) {
    lower_bound <- min(data, na.rm = TRUE)
  }
  if (is.null(upper_bound)) {
    upper_bound <- max(data, na.rm = TRUE)
  }

  if (!is.numeric(lower_bound)) {
    stop("Error for parameter lower_bound: Has to be a numeric value or NULL.")
  }
  if (!is.numeric(upper_bound)) {
    stop("Error for parameter upper_bound: Has to be a numeric value or NULL.")
  }

  # The sampling interval must not be empty or degenerate
  if (lower_bound >= upper_bound) {
    stop("Error for parameter lower_bound: lower_bound must be smaller than upper_bound. In case you are using the default settings for these two parameters (which use the min and max of the input series as bounds for the random numbers) appearance of this error message means all values of your time series have the same unique value. In this case try to set the bounds manually.")
  }

  ##
  ## 2. Imputation Code
  ##

  # One uniform draw per missing position
  n_missing <- length(data[missindx])
  data[missindx] <- stats::runif(n_missing, min = lower_bound, max = upper_bound)

  ##
  ## 3. Post Processing
  ##

  # 3.1 maxgap: only acts for finite, non-negative values
  if (is.finite(maxgap) && maxgap >= 0) {
    # Run-length encode the NA pattern of the original input, unmark every
    # run whose length is <= maxgap, and expand back to a logical vector that
    # is TRUE only inside NA runs longer than maxgap
    na_runs <- rle(is.na(x))
    na_runs$values[na_runs$lengths <= maxgap] <- FALSE
    too_long <- inverse.rle(na_runs)

    # Those positions are set back to NA
    data[too_long] <- NA
  }

  ##
  ## 4. Final Output Formatting
  ##

  # Input without a second dimension -> return the imputed series itself
  if (is.null(dim(x)[2])) {
    return(data)
  }

  # One-column input -> write the result into the original object
  x[, 1] <- data
  return(x)
}


================================================
FILE: R/na_remove.R
================================================
#' @title Remove Missing Values
#'
#' @description Removes all missing values from a time series.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @return Vector (\code{\link{vector}})
#'
#' @details Removes all missing values from a input time series. This shortens
#' the time series by the number of missing values in the series. Should be
#' handled with care, because this can affect the seasonality of the time
#' series. Seasonal patterns might be destroyed. Independent from the input,
#' this function only returns a vector. (the time information of a resulting
#' time series object wouldn't be correct any more).
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Prerequisite: Create Time series with missing values
#' x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8))
#'
#' # Example 1: Remove all NAs
#' na_remove(x)
#'
#' # Example 2: Remove all NAs in tsAirgap
#' na_remove(tsAirgap)
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' x %>% na_remove()
#' @importFrom stats ts
#' @importFrom magrittr %>%
#' @export
na_remove <- function(x) {
  # Variable 'data' is used for all transformations to the time series
  data <- x

  #----------------------------------------------------------
  # Multivariate Input
  # Removing values column-wise would give columns of different length,
  # so input with more than one column is rejected
  #----------------------------------------------------------
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    stop("na_remove only works with univariate input")
  }

  #----------------------------------------------------------
  # Univariate Input
  # All relevant pre- / postprocessing code is within this part
  #----------------------------------------------------------

  # Positions of the missing values in the input as supplied
  missindx <- is.na(data)

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Check if NAs are present
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 special handling data types
  if (any(class(data) == "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 Check for algorithm specific minimum amount of non-NA values
  if (all(missindx)) {
    stop("Input data has solely NA values.")
  }

  # 1.4 Checks and corrections for wrong data dimension

  # Check if input dimensionality is not as expected
  if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
    stop("Wrong input type for parameter x.")
  }

  # Altering multivariate objects with 1 column (which are essentially
  # univariate) to be dim = NULL
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  ##
  ## 2. Removal Code
  ##

  # Keep the observed values with one vectorised subset instead of growing a
  # vector element by element (which is quadratic in the series length).
  # c(numeric(), ...) reproduces the result type of the former loop: class
  # attributes are dropped, names are kept and integer / logical input is
  # promoted to double.
  temp <- c(numeric(), data[!is.na(data)])

  ##
  ## 3. Post Processing
  ##

  # No Post Processing needed for na_remove

  ##
  ## 4. Final Output Formatting
  ##

  # Since all time information of a ts object would be incorrect
  # after removing values only the vector is returned by the function
  return(temp)
}


================================================
FILE: R/na_replace.R
================================================
#' @title Replace Missing Values by a Defined Value
#'
#' @description Replaces all missing values with a given value.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param fill Value used to replace the missing values
#'
#' @param maxgap Maximum number of successive NAs to still perform imputation on.
#' Default setting is to replace all NAs without restrictions. With this
#' option set, consecutive NAs runs, that are longer than 'maxgap' will
#' be left NA. This option mostly makes sense if you want to
#' treat long runs of NA afterwards separately.
#'
#' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object (dependent on given input at parameter x)
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Prerequisite: Create Time series with missing values
#' x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8))
#'
#' # Example 1: Replace all NAs with 3.5
#' na_replace(x, fill = 3.5)
#'
#' # Example 2: Replace all NAs with 0
#' na_replace(x, fill = 0)
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' x %>% na_replace(fill = 3.5)
#' @importFrom stats ts
#' @importFrom magrittr %>%
#' @export
na_replace <- function(x, fill = 0, maxgap = Inf) {
  # 'data' carries all transformations of the series. 'x' is kept untouched so
  # that the originally supplied object can be handed back at the end.
  data <- x

  #----------------------------------------------------------
  # Multivariate Input
  # Every column is imputed on its own by calling this function recursively
  # with univariate input.
  #----------------------------------------------------------
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    for (i in seq_len(dim(data)[2])) {
      if (!anyNA(data[, i])) {
        next
      }
      # If imputing a column does not work - mostly because it is not numeric -
      # the column is left unchanged and a warning is raised instead.
      tryCatch(
        data[, i] <- na_replace(data[, i], fill, maxgap),
        error = function(cond) {
          warning(paste(
            "na_replace: No imputation performed for column", i,
            "of the input dataset. Reason:", cond[1]
          ), call. = FALSE)
        }
      )
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate Input
  #----------------------------------------------------------

  # Positions of the missing values as a plain logical vector. The dim
  # attribute is dropped on purpose: for one-column matrix / data.frame /
  # tibble input is.na() returns a matrix, which rle() (used for the maxgap
  # handling below) refuses to process.
  missindx <- as.vector(is.na(data))

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Nothing to do when no NAs are present
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 Special handling of data types
  if (any(class(data) == "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 Minimum amount of non-NA values
  # Not needed for na_replace, it works with all-NA vectors

  # 1.4 Checks and corrections for wrong data dimension
  if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
    stop("Wrong input type for parameter x.")
  }

  # One-column objects are essentially univariate: drop the dimension
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.5 Check if input is numeric
  # Combined with the all-NA check, since an all-NA vector is logical and
  # therefore returns FALSE for is.numeric()
  if (!is.numeric(data) && !all(missindx)) {
    stop("Input x is not numeric.")
  }

  ##
  ## 2. Imputation Code
  ##
  data[missindx] <- fill

  ##
  ## 3. Post Processing
  ##

  # 3.1 maxgap option: NA runs longer than 'maxgap' are set back to NA.
  # Nothing happens for maxgap = Inf or negative values.
  if (is.finite(maxgap) && maxgap >= 0) {
    # Run-length encoding of the NA indicator: lengths of the alternating
    # runs of FALSE (observed) and TRUE (missing)
    rlencoding <- rle(missindx)

    # Runs not longer than maxgap (which shall stay imputed) are set FALSE,
    # so only the too-long NA runs remain TRUE
    rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE

    # Expand back to one flag per observation
    en <- inverse.rle(rlencoding)

    # Undo the imputation inside gaps that are longer than maxgap
    data[en == TRUE] <- NA
  }

  ##
  ## 4. Final Output Formatting
  ##

  # Give back the object originally supplied to the function
  # (necessary for multivariate input with only 1 column)
  if (!is.null(dim(x)[2])) {
    x[, 1] <- data
    return(x)
  }

  return(data)
}

================================================
FILE: R/na_seadec.R
================================================
#' @title Seasonally Decomposed Missing Value Imputation
#'
#' @description Removes the seasonal component from the time series,
#' performs imputation on the deseasonalized series and afterwards adds
#' the seasonal component again.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#' @param algorithm Algorithm to be used after decomposition.
#' Accepts the following input:
#' \itemize{
#' \item{"interpolation" - Imputation by Interpolation} (default choice)
#' \item{"locf" - Imputation by Last Observation Carried Forward}
#' \item{"mean" - Imputation by Mean Value}
#' \item{"random" - Imputation by Random Sample}
#' \item{"kalman" - Imputation by Kalman Smoothing and State Space Models}
#' \item{"ma" - Imputation by Weighted Moving Average}
#' }
#' @param find_frequency If TRUE the algorithm will try to estimate the frequency
#' of the time-series automatically.
#' #' @param maxgap Maximum number of successive NAs to still perform imputation on. #' Default setting is to replace all NAs without restrictions. With this #' option set, consecutive NAs runs, that are longer than 'maxgap' will #' be left NA. This option mostly makes sense if you want to #' treat long runs of NA afterwards separately. #' #' @param ... Additional parameters for these algorithms that can be passed #' through. Look at \code{\link[imputeTS]{na_interpolation}}, #' \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_random}}, #' \code{\link[imputeTS]{na_mean}} for parameter options. #' #' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) #' object (dependent on given input at parameter x) #' #' @details The algorithm first performs a Seasonal Decomposition of Time Series by Loess #' via \code{\link[stats]{stl}}. Decomposing the time series into seasonal, trend and irregular #' components. The seasonal component gets then removed (subtracted) from the original series. #' As a second step the selected imputation algorithm e.g. na_locf, na_ma, ... is applied #' on the deseasonalized series. Thus, the algorithm can work without being affected by seasonal #' patterns. After filling the NA gaps, the seasonal component is added to the deseasonalized #' series again. #' #' Implementation details: #' A paper about the STL Decomposition procedure is linked in the references. #' Since the function only works with complete data, the initial NA data is temporarily filled #' via linear interpolation in order to perform the decomposition. These temporarily imputed #' values are replaced with NAs again after obtaining the decomposition for the non-NA #' observations. STL decomposition is run with robust = TRUE and s.window = 11. Additionally, #' applying STL decomposition needs a preset frequency. 
#' This can be passed by the frequency
#' set in the input ts object or by setting 'find_frequency=TRUE' in order to find
#' an appropriate frequency for the time series. The find_frequency parameter internally uses
#' \code{\link[forecast]{findfrequency}}, which does a spectral analysis of the time series
#' for identifying a suitable frequency. Using find_frequency will update the previously set
#' frequency of a ts object to the newly found frequency. The default is 'find_frequency = FALSE',
#' which gives a warning if no seasonality is set for the supplied time series object.
#' If neither seasonality is set nor find_frequency is set to TRUE, the function goes on without
#' decomposition and just applies the selected secondary algorithm to the original time series
#' that still includes seasonality. The same fallback is used when the series does not
#' cover more than two complete periods. The 'maxgap' option is applied in the fallback
#' case as well.
#'
#'
#' @references R. B. Cleveland, W. S. Cleveland, J.E. McRae, and I.
#' Terpenning (1990) STL: A Seasonal-Trend Decomposition Procedure
#' Based on Loess. Journal of Official Statistics, 6, 3–73.
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Example 1: Perform seasonal imputation using algorithm = "interpolation"
#' na_seadec(tsAirgap, algorithm = "interpolation")
#'
#' # Example 2: Perform seasonal imputation using algorithm = "mean"
#' na_seadec(tsAirgap, algorithm = "mean")
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' tsAirgap %>% na_seadec(algorithm = "interpolation")
#' @importFrom stats frequency stl ts
#' @importFrom forecast findfrequency
#' @importFrom magrittr %>%
#' @export
na_seadec <- function(x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ...) {
  # 'data' carries all transformations of the series. 'x' is kept untouched so
  # that the originally supplied object can be handed back at the end.
  data <- x

  #----------------------------------------------------------
  # Multivariate Input
  # Every column is imputed on its own by calling this function recursively
  # with univariate input.
  #----------------------------------------------------------
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    for (i in seq_len(dim(data)[2])) {
      if (!anyNA(data[, i])) {
        next
      }
      # If imputing a column does not work - mostly because it is not numeric -
      # the column is left unchanged and a warning is raised instead.
      tryCatch(
        data[, i] <- na_seadec(data[, i], algorithm, find_frequency, maxgap, ...),
        error = function(cond) {
          warning(paste(
            "na_seadec: No imputation performed for column", i,
            "of the input dataset. Reason:", cond[1]
          ), call. = FALSE)
        }
      )
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate Input
  #----------------------------------------------------------

  # Positions of the missing values as a plain logical vector (dim dropped,
  # because rle() in the maxgap handling only accepts plain vectors)
  missindx <- as.vector(is.na(data))

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Nothing to do when no NAs are present
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 Special handling of data types
  if (any(class(data) == "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 Algorithm specific minimum amount of non-NA values
  if (sum(!missindx) < 3) {
    stop("At least 3 non-NA data points required in the time series to apply na_seadec.")
  }

  # 1.4 Checks and corrections for wrong data dimension
  if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
    stop("Wrong input type for parameter x.")
  }

  # One-column objects are essentially univariate: drop the dimension
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.5 Check if input is numeric
  if (!is.numeric(data)) {
    stop("Input x is not numeric.")
  }

  # 1.6 Checks and corrections for time series frequency
  # 'decompose' is switched off as soon as one of the checks shows that a
  # seasonal decomposition is not possible. The plain base algorithm is used
  # in that case, but post processing (maxgap) and output formatting still
  # run afterwards. Only the first failing check emits its warning.
  decompose <- TRUE

  if (find_frequency == TRUE) {
    t <- as.vector(data)
    freq <- forecast::findfrequency(na_interpolation(t))
    if (freq > 1) {
      data <- ts(t, frequency = freq)
    } else if (freq == 1) {
      warning("Option find_frequency = TRUE could not detect a seasonal pattern. The algorithm will go on without seasonal decomposition. You might consider manually setting a frequency by creating a time series with frequency information. Here is an example for weekly data: new_ts <- ts(old_ts, frequency = 7)")
      decompose <- FALSE
    }
  }

  if (decompose && stats::frequency(data) == 1) {
    warning("No seasonality information for dataset could be found, going on without decomposition. Setting find_frequency=TRUE might be an option.")
    decompose <- FALSE
  }

  # stl() needs strictly more than two complete periods, hence '<='
  if (decompose && length(data) <= stats::frequency(data) * 2) {
    warning("More than 2 complete periods needed to perform a seasonal decomposition The algorithm will go on without seasonal decomposition.")
    decompose <- FALSE
  }

  ##
  ## 2. Imputation Code
  ##
  if (decompose) {
    # stl() does not work with NAs, so it is run on an interpolated copy
    temp <- na_interpolation(data)
    decomposition <- stats::stl(temp, robust = TRUE, s.window = 11)

    # Trend + irregular component, i.e. the series without seasonality
    ts_no_seasonality <- decomposition$time.series[, 2] + decomposition$time.series[, 3]

    # Restore the NAs that were only filled temporarily for stl()
    ts_no_seasonality[missindx] <- NA

    # Impute the deseasonalized series with the selected algorithm
    ts_no_seasonalityimputed <- apply_base_algorithm(ts_no_seasonality, algorithm = algorithm, ...)

    # Add the seasonal component again
    ts_imputed <- ts_no_seasonalityimputed + decomposition$time.series[, 1]

    # Only former NA positions are taken over into the result
    data[missindx] <- ts_imputed[missindx]
  } else {
    # Fallback: selected algorithm on the series as it is
    data <- apply_base_algorithm(data, algorithm = algorithm, ...)
  }

  ##
  ## 3. Post Processing
  ##

  # 3.1 maxgap option: NA runs longer than 'maxgap' are set back to NA.
  # Nothing happens for maxgap = Inf or negative values.
  if (is.finite(maxgap) && maxgap >= 0) {
    # Run-length encoding of the NA indicator of the original input
    rlencoding <- rle(missindx)

    # Runs not longer than maxgap (which shall stay imputed) are set FALSE,
    # so only the too-long NA runs remain TRUE
    rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE

    # Expand back to one flag per observation
    en <- inverse.rle(rlencoding)

    # Undo the imputation inside gaps that are longer than maxgap
    data[en == TRUE] <- NA
  }

  ##
  ## 4. Final Output Formatting
  ##

  # Give back the object originally supplied to the function
  # (necessary for multivariate input with only 1 column)
  if (!is.null(dim(x)[2])) {
    x[, 1] <- data
    return(x)
  }

  return(data)
}

================================================
FILE: R/na_seasplit.R
================================================
#' @title Seasonally Splitted Missing Value Imputation
#'
#' @description Splits the times series into seasons and afterwards performs
#' imputation separately for each of the resulting time series datasets
#' (each containing the data for one specific season).
#
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#' @param algorithm Algorithm to be used after splits.
#' Accepts the following input:
#' \itemize{
#' \item{"interpolation" - Imputation by Interpolation} (default choice)
#' \item{"locf" - Imputation by Last Observation Carried Forward}
#' \item{"mean" - Imputation by Mean Value}
#' \item{"random" - Imputation by Random Sample}
#' \item{"kalman" - Imputation by Kalman Smoothing and State Space Models}
#' \item{"ma" - Imputation by Weighted Moving Average}
#' }
#'
#' @param find_frequency If TRUE the algorithm will try to estimate the frequency
#' of the time-series automatically.
#'
#' @param maxgap Maximum number of successive NAs to still perform imputation on.
#' Default setting is to replace all NAs without restrictions. With this
#' option set, consecutive NAs runs, that are longer than 'maxgap' will
#' be left NA. This option mostly makes sense if you want to
#' treat long runs of NA afterwards separately.
#'
#' @param ... Additional parameters for these algorithms that can be
#' passed through. Look at \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_random}},
#' \code{\link[imputeTS]{na_mean}} for parameter options.
#'
#' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object (dependent on given input at parameter x)
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}
#'
#' @examples
#' # Example 1: Perform seasonal splitted imputation using algorithm = "interpolation"
#' na_seasplit(tsAirgap, algorithm = "interpolation")
#'
#' # Example 2: Perform seasonal splitted imputation using algorithm = "mean"
#' na_seasplit(tsAirgap, algorithm = "mean")
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' tsAirgap %>% na_seasplit(algorithm = "interpolation")
#' @importFrom stats frequency ts
#' @importFrom magrittr %>%
#' @export
#' @name na_seasplit
na_seasplit <- function(x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ...) {
  # 'data' carries all transformations of the series. 'x' is kept untouched so
  # that the originally supplied object can be handed back at the end.
  data <- x

  #----------------------------------------------------------
  # Multivariate Input
  # Every column is imputed on its own by calling this function recursively
  # with univariate input.
  #----------------------------------------------------------
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    for (i in seq_len(dim(data)[2])) {
      if (!anyNA(data[, i])) {
        next
      }
      # If imputing a column does not work - mostly because it is not numeric -
      # the column is left unchanged and a warning is raised instead.
      tryCatch(
        data[, i] <- na_seasplit(data[, i], algorithm, find_frequency, maxgap, ...),
        error = function(cond) {
          warning(paste(
            "na_seasplit: No imputation performed for column", i,
            "of the input dataset. Reason:", cond[1]
          ), call. = FALSE)
        }
      )
    }
    return(data)
  }

  #----------------------------------------------------------
  # Univariate Input
  #----------------------------------------------------------

  # Positions of the missing values as a plain logical vector (dim dropped,
  # because rle() in the maxgap handling only accepts plain vectors)
  missindx <- as.vector(is.na(data))

  ##
  ## 1. Input Check and Transformation
  ##

  # 1.1 Nothing to do when no NAs are present
  if (!anyNA(data)) {
    return(x)
  }

  # 1.2 Special handling of data types
  if (any(class(data) == "tbl")) {
    data <- as.vector(as.data.frame(data)[, 1])
  }

  # 1.3 Algorithm specific minimum amount of non-NA values
  if (sum(!missindx) < 3) {
    stop("At least 3 non-NA data points required in the time series to apply na_seasplit.")
  }

  # 1.4 Checks and corrections for wrong data dimension
  if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
    stop("Wrong input type for parameter x")
  }

  # One-column objects are essentially univariate: drop the dimension
  if (!is.null(dim(data)[2])) {
    data <- data[, 1]
  }

  # 1.5 Check if input is numeric
  if (!is.numeric(data)) {
    stop("Input x is not numeric")
  }

  # 1.6 Checks and corrections for time series frequency
  # 'split_seasons' is switched off as soon as one of the checks shows that a
  # seasonal split is not possible. The plain base algorithm is used in that
  # case, but post processing (maxgap) and output formatting still run
  # afterwards. Only the first failing check emits its warning.
  split_seasons <- TRUE

  if (find_frequency == TRUE) {
    t <- as.vector(data)
    freq <- forecast::findfrequency(na_interpolation(t))
    if (freq > 1) {
      data <- ts(t, frequency = freq)
    } else if (freq == 1) {
      warning("Parameter find_frequency = TRUE could not detect a seasonal pattern. The algorithm will go on without seasonal decomposition. You might consider manually setting a frequency by creating a time series with frequency information. Here is an example for weekly data: new_ts <- ts(old_ts, frequency = 7)")
      split_seasons <- FALSE
    }
  }

  if (split_seasons && stats::frequency(data) == 1) {
    warning("No seasonality information for dataset could be found, going on without decomposition. Setting find_frequency=TRUE might be an option.")
    split_seasons <- FALSE
  }

  if (split_seasons && length(data) < stats::frequency(data) * 2) {
    warning("More than 2 complete periods needed to perform a seasonal split. The algorithm will go on without seasonal split.")
    split_seasons <- FALSE
  }

  ##
  ## 2. Imputation Code
  ##
  if (split_seasons) {
    period <- stats::frequency(data)
    for (i in 1:period) {
      # Indices of all observations belonging to season i
      indices <- seq(from = i, to = length(data), by = period)

      # Series that only consists of this one season
      ts_temp <- stats::ts(data[indices])

      # Impute this season on its own
      ts_temp <- apply_base_algorithm(ts_temp, algorithm = algorithm, ...)

      # Write the result back to the positions of this season
      data[indices] <- as.vector(ts_temp)
    }
  } else {
    # Fallback: selected algorithm on the series as it is
    data <- apply_base_algorithm(data, algorithm = algorithm, ...)
  }

  ##
  ## 3. Post Processing
  ##

  # 3.1 maxgap option: NA runs longer than 'maxgap' are set back to NA.
  # Nothing happens for maxgap = Inf or negative values.
  if (is.finite(maxgap) && maxgap >= 0) {
    # Run-length encoding of the NA indicator of the original input
    rlencoding <- rle(missindx)

    # Runs not longer than maxgap (which shall stay imputed) are set FALSE,
    # so only the too-long NA runs remain TRUE
    rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE

    # Expand back to one flag per observation
    en <- inverse.rle(rlencoding)

    # Undo the imputation inside gaps that are longer than maxgap
    data[en == TRUE] <- NA
  }

  ##
  ## 4. Final Output Formatting
  ##

  # Give back the object originally supplied to the function
  # (necessary for multivariate input with only 1 column)
  if (!is.null(dim(x)[2])) {
    x[, 1] <- data
    return(x)
  }

  return(data)
}

================================================
FILE: R/statsNA.R
================================================
#' @title Print Statistics about Missing Values
#'
#' @description Print summary stats about the distribution of
#' missing values in a univariate time series.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or
#' Time Series (\code{\link{ts}}) object containing NAs
#'
#' @param bins Split number for bin stats. Number of bins the time series gets
#' divided into. For each bin information about amount/percentage of missing
#' values is printed. Default value is 4 - what means stats about the
#' 1st,2nd,3rd,4th quarter of the time series are shown.
#'
#' @param print_only Choose if the function Prints or Returns.
#' For print_only = TRUE the function has no return value and just prints out
#' missing value stats. If print_only is changed to FALSE, nothing is printed
#' and the function returns a list.Print gives a little bit more information,
#' since the returned list does not include "Stats for Bins"
#' and "overview NA series"
#'
#' @return A \code{\link{list}} containing the stats. Beware: Function gives
#' only a return value if print_only = FALSE.
#' #' @details Prints the following information about the missing values in the time series: #' \itemize{ #' \item{"Length of time series" - Number of observations in the time series (including NAs)} #' \item{"Number of Missing Values" - Number of missing values in the time series} #' \item{"Percentage of Missing Values" - Percentage of missing values in the time series} #' \item{"Number of Gaps" - Number of NA gaps (consisting of one or more consecutive NAs) in the time series} #' \item{"Average Gap Size" - Average size of consecutive NAs for the NA gaps in the time series} #' \item{"Stats for Bins" - Number/percentage of missing values for the split into bins } #' \item{"Longest NA gap" - Longest series of consecutive missing values (NAs in a row) in the time series } #' \item{"Most frequent gap size" - Most frequent occurring series of missing values in the time series} #' \item{"Gap size accounting for most NAs" - The series of consecutive missing values that accounts for most missing values overall in the time series} #' \item{"Overview NA series" - Overview about how often each series of consecutive missing values occurs. Series occurring 0 times are skipped} #' } #' It is furthermore, important to note, that you are able to choose whether #' the function returns a list or prints the information only. 
#' (see description of parameter "print_only")
#'
#' @author Steffen Moritz
#' @seealso \code{\link[imputeTS]{ggplot_na_distribution}},
#' \code{\link[imputeTS]{ggplot_na_distribution2}},
#' \code{\link[imputeTS]{ggplot_na_gapsize}}
#'
#' @examples
#' # Example 1: Print stats about the missing data in tsNH4
#' statsNA(tsNH4)
#'
#' # Example 2: Return list with stats about the missing data in tsAirgap
#' statsNA(tsAirgap, print_only = FALSE)
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' tsNH4 %>% statsNA()
#' @importFrom magrittr %>%
#' @export
statsNA <- function(x, bins = 4, print_only = TRUE) {
  data <- x

  ##
  ## Input check
  ##
  if (!is.null(dim(data)) && dim(data)[2] != 1) {
    stop("Input x is not univariate")
  }

  if (!is.numeric(data)) {
    stop("Input x is not numeric")
  }

  n <- length(data)
  if (n == 0) {
    stop("Input x has length 0")
  }

  ##
  ## Analysis Code
  ##

  # NA indicator as plain logical vector (dim dropped so rle() accepts it)
  missindx <- as.vector(is.na(data))

  ## Count NAs
  number_NAs <- sum(missindx)

  ## Calculate Percentage
  pct_NAs <- number_NAs / n * 100

  ## NA in Bins // bins_df is result data frame
  # Bins have ceiling(n / bins) observations each. Ends are clamped to the
  # series length and bins that would start behind the last observation are
  # dropped, so no positions outside of the series are ever inspected.
  length_bin <- ceiling(n / bins)
  bin_start <- (seq_len(bins) - 1) * length_bin + 1
  bin_end <- pmin(seq_len(bins) * length_bin, n)
  inside <- bin_start <= n
  bin_start <- bin_start[inside]
  bin_end <- bin_end[inside]
  bin_num <- bin_end - bin_start + 1
  bin_num_NA <- vapply(
    seq_along(bin_start),
    function(k) sum(missindx[bin_start[k]:bin_end[k]]),
    numeric(1)
  )
  bins_df <- data.frame(
    start = bin_start,
    end = bin_end,
    num = bin_num,
    num_NA = bin_num_NA,
    num_nonNA = bin_num - bin_num_NA,
    pct_NA = bin_num_NA / bin_num * 100
  )

  ## Consecutive NAs // vec is result vector
  # vec[k] is the number of NA gaps that are exactly k observations long
  runs <- rle(missindx)
  gap_sizes <- runs$lengths[runs$values]
  vec <- as.numeric(tabulate(gap_sizes, nbins = n))

  number_gaps <- sum(vec)

  if (number_gaps > 0) {
    # Longest Consecutive NA
    longest_NA <- max(gap_sizes)

    # Most Common Consecutive NA (ties are resolved towards the larger gap)
    common_NAnum <- max(vec)
    common_NA <- max(which(vec == common_NAnum))

    # Biggest Weight Consecutive NA: gap size accounting for most NAs overall
    # (ties are resolved towards the larger gap)
    weights <- vec * seq_along(vec)
    wcommon_NAnum <- max(weights)
    wcommon_NA <- max(which(weights == wcommon_NAnum))
  } else {
    # Without any gap there is no meaningful gap size to report
    longest_NA <- NA
    common_NA <- NA
    common_NAnum <- 0
    wcommon_NA <- NA
    wcommon_NAnum <- 0
  }

  # Average NA gapsize
  if (number_NAs > 0) {
    average_gapsize <- number_NAs / number_gaps
  } else {
    average_gapsize <- 0
  }

  #### Print everything
  if (print_only == TRUE) {
    ## Print Number NA and Pct NA
    print("Length of time series:")
    print(n)
    print("-------------------------")
    print("Number of Missing Values:")
    print(number_NAs)
    print("-------------------------")
    print("Percentage of Missing Values:")
    print(paste0(format(pct_NAs, digits = 3), "%"))
    print("-------------------------")
    print("Number of Gaps:")
    print(number_gaps)
    print("-------------------------")
    print("Average Gap Size:")
    print(average_gapsize)
    print("-------------------------")

    # Exit if no NAs
    if (number_NAs < 1) {
      print("No NAs in the time series.")
      print("-------------------------")
      return("There are no NAs in the time series")
    }

    ## Print bin stats
    print("Stats for Bins")
    for (i in seq_len(nrow(bins_df))) {
      print(paste0(
        " Bin ", i, " (", bins_df$num[i], " values from ", bins_df$start[i],
        " to ", bins_df$end[i], ") : ", bins_df$num_NA[i], " NAs (",
        format(bins_df$pct_NA[i], digits = 3), "%)"
      ))
    }
    print("-------------------------")

    ## Print Consecutive NAs
    print("Longest NA gap (series of consecutive NAs)")
    print(paste(longest_NA, "in a row"))
    print("-------------------------")
    print("Most frequent gap size (series of consecutive NA series)")
    print(paste0(common_NA, " NA in a row (occurring ", common_NAnum, " times)"))
    print("-------------------------")
    print("Gap size accounting for most NAs")
    print(paste0(
      wcommon_NA, " NA in a row (occurring ", wcommon_NAnum / wcommon_NA,
      " times, making up for overall ", wcommon_NAnum, " NAs)"
    ))
    print("-------------------------")
    print("Overview NA series")
    for (i in which(vec > 0)) {
      print(paste0(" ", i, " NA in a row: ", vec[i], " times"))
    }
  }

  if (print_only == FALSE) {
    output <- list(
      length_series = n,
      number_NAs = number_NAs,
      number_na_gaps = number_gaps,
      average_size_na_gaps = average_gapsize,
      percentage_NAs = paste0(format(pct_NAs, digits = 3), "%"),
      longest_na_gap = longest_NA,
      most_frequent_na_gap = common_NA,
      most_weighty_na_gap = wcommon_NA,
      df_distribution_na_gaps = vec
    )
    return(output)
  }
}

================================================
FILE: R/tsAirgap.R
================================================
#' @title Time series of monthly airline passengers (with NAs)
#'
#' @description Monthly totals of international airline passengers, 1949 to 1960.
#' This time series contains missing values. In the package included is also the \code{\link{tsAirgapComplete}} time series providing the true values for the
#' missing values.
#'
#' @details The dataset originates from Box and Jenkins (see citation) and is a commonly used example in
#' time series analysis literature.
#'
#' It characteristics (strong trend, strong seasonal behavior) make it also a great
#' example for time series imputation.
#' Thus the version with inserted NA gaps was created under the name tsAirgap.
#'
#' In order to use this series for comparing imputation algorithm results,
#' there are two time series provided. One series without missing values, which can
#' be used as ground truth. Another series with NAs, on which the imputation
#' algorithms can be applied.
#'
#' There are the two time series:
#' \itemize{
#' \item tsAirgap - The time series with NAs.
#'
#' \item tsAirgapComplete - Time series without NAs.
#' } #' @docType data #' @keywords datasets #' @seealso \code{\link[imputeTS]{tsHeating}}, \code{\link[imputeTS]{tsNH4}} #' @name tsAirgap #' @usage tsAirgap #' #' #' @source \cite{Box, G. E. P., Jenkins, G. M., Reinsel, G. C. and Ljung, G. M. (2015). Time series analysis: forecasting and control. Fifth Edition. John Wiley and Sons.} #' @format Time Series (\code{\link{ts}}) with 144 rows including 13 NAs. NULL ================================================ FILE: R/tsAirgapComplete.R ================================================ #' @title Time series of monthly airline passengers (complete) #' #' @description Monthly totals of international airline passengers, 1949 to 1960. #' This time series provides the truth for the missing values of the \code{\link{tsAirgap}} time series. Thus it is identical #' to the tsAirgap time series except that no value is missing. #' #' @details The dataset originates from Box and Jenkins (see citation) and is a commonly used example in #' time series analysis literature. #' #' Its characteristics (strong trend, strong seasonal behavior) also make it a great #' example for time series imputation. #' Thus the version with inserted NA gaps was created under the name tsAirgap. #' #' In order to use this series for comparing imputation algorithm results, #' there are two time series provided. One series without missing values, which can #' be used as ground truth. Another series with NAs, on which the imputation #' algorithms can be applied. #' #' There are the two time series: #' \itemize{ #' \item tsAirgap - The time series with NAs. #' #' \item tsAirgapComplete - Time series without NAs. #' } #' @docType data #' @keywords datasets #' @seealso \code{\link[imputeTS]{tsHeating}}, \code{\link[imputeTS]{tsNH4}} #' @name tsAirgapComplete #' @usage tsAirgapComplete #' @source \cite{Box, G. E. P., Jenkins, G. M., Reinsel, G. C. and Ljung, G. M. (2015). Time series analysis: forecasting and control. Fifth Edition.
John Wiley and Sons.} #' @format Time Series (\code{\link{ts}}) with 144 rows. NULL ================================================ FILE: R/tsHeating.R ================================================ #' @title Time series of a heating systems supply temperature (with NAs) #' #' @description Time series of a heating systems supply temperature. Measured from 18.11.2013 - 05:12:00 to 13.01.2015 - 15:08:00 in 1 minute steps. #' This time series contains missing values. In the package included is also the \code{\link{tsHeatingComplete}} time series providing the true values for the #' missing values. #' #' @details The time series originates from the GECCO Industrial Challenge 2015. #' This Challenge was about "Recovering missing information in heating system operating data". #' Goal was to impute missing values in heating system sensor data as accurate as possible. #' (\doi{10.5281/zenodo.3884899}) #' #' In order to use this series for comparing imputation algorithm results, #' there are two time series provided. One series without missing values, which can #' be used as ground truth. Another series with NAs, on which the imputation #' algorithms can be applied. The NAs thereby were inserted according to patterns #' found in similar time series. #' #' There are the two time series: #' \itemize{ #' \item tsHeating - The time series with NAs. #' #' \item tsHeatingComplete - Time series without NAs. #' } #' @docType data #' @keywords datasets #' @seealso \code{\link[imputeTS]{tsAirgap}}, \code{\link[imputeTS]{tsNH4}} #' @name tsHeating #' @usage tsHeating #' #' @source \cite{Moritz, Steffen, Friese, Martina, Fischbach, Andreas, Schlitt, #' Christopher, and Bartz-Beielstein, Thomas. (2015, May 1). #' GECCO Industrial Challenge 2015 Dataset: A heating system dataset for the 'Recovering #' missing information in heating system operating data' competition at the #' Genetic and Evolutionary Computation Conference 2015, Madrid, Spain. 
#' http://doi.org/10.5281/zenodo.3884899 } #' #' @format Time Series (\code{\link{ts}}) with 606837 rows including 57391 NAs. NULL ================================================ FILE: R/tsHeatingComplete.R ================================================ #' @title Time series of a heating systems supply temperature (complete) #' #' @description Time series of a heating systems supply temperature. Measured from 18.11.2013 - 05:12:00 to 13.01.2015 - 15:08:00 in 1 minute steps. #' This time series provides the truth for the missing values of the \code{\link{tsHeating}} time series. Thus it is identical #' to the heating time series except that no value is missing. #' #' @details The time series originates from the GECCO Industrial Challenge 2015. #' This Challenge was about "Recovering missing information in heating system operating data". #' Goal was to impute missing values in heating system sensor data as accurate as possible. #' (\doi{10.5281/zenodo.3884899}) #' #' In order to use this series for comparing imputation algorithm results, #' there are two time series provided. One series without missing values, which can #' be used as ground truth. Another series with NAs, on which the imputation #' algorithms can be applied. The NAs thereby were inserted according to patterns #' found in similar time series. #' #' There are the two time series: #' \itemize{ #' \item tsHeating - The time series with NAs. #' #' \item tsHeatingComplete - Time series without NAs. #' } #' @docType data #' @keywords datasets #' @seealso \code{\link[imputeTS]{tsAirgap}}, \code{\link[imputeTS]{tsNH4}} #' @name tsHeatingComplete #' @usage tsHeatingComplete #' #' @source \cite{Moritz, Steffen, Friese, Martina, Fischbach, Andreas, Schlitt, #' Christopher, and Bartz-Beielstein, Thomas. (2015, May 1). 
#' GECCO Industrial Challenge 2015 Dataset: A heating system dataset for the 'Recovering #' missing information in heating system operating data' competition at the #' Genetic and Evolutionary Computation Conference 2015, Madrid, Spain. #' http://doi.org/10.5281/zenodo.3884899 } #' #' @format Time Series (\code{\link{ts}}) with 606837 rows. NULL ================================================ FILE: R/tsNH4.R ================================================ #' @title Time series of NH4 concentration in a wastewater system (with NAs) #' #' @description Time series of NH4 concentration in a wastewater system. Measured from 30.11.2010 - 16:10 to 01.01.2011 - 6:40 in 10 minute steps. #' This time series contains missing values. In the package included is also the \code{\link{tsNH4Complete}} time series providing the true values for the #' missing values. #' #' @details The time series is derived from the dataset of the GECCO Industrial Challenge 2014. #' #' In order to use this series for comparing imputation algorithm results, #' there are two time series provided. One series without missing values, which can #' be used as ground truth. Another series with NAs, on which the imputation #' algorithms can be applied. The NAs thereby were inserted according to patterns #' found in similar time series. #' #' There are the two time series: #' \itemize{ #' \item tsNH4 - The time series with NAs. #' #' \item tsNH4Complete - Time series without NAs. #' } #' @docType data #' @keywords datasets #' @seealso \code{\link[imputeTS]{tsAirgap}},\code{\link[imputeTS]{tsHeating}} #' @name tsNH4 #' @usage tsNH4 #' #' @source \cite{Friese, Martina, Fischbach, Andreas, Flasch, Oliver, Mersmann, Olaf, #' Bartz-Beielstein, Thomas, and Walbeck, Klaus. (2014, July 16). 
#' GECCO Industrial Challenge 2014 Dataset: A water quality dataset for the #' 'Active protection against pollution of the surface water' competition at the #' Genetic and Evolutionary Computation Conference 2014, Vancouver, Canada. #' http://www.spotseven.de/gecco-challenge/gecco-challenge-2014} #' #' @format Time Series (\code{\link{ts}}) with 4552 rows including 883 NAs. NULL ================================================ FILE: R/tsNH4Complete.R ================================================ #' @title Time series of NH4 concentration in a wastewater system (complete) #' #' @description Time series of NH4 concentration in a wastewater system. Measured from 30.11.2010 - 16:10 to 01.01.2011 - 6:40 in 10 minute steps. #' This time series provides the truth for the missing values of the \code{\link{tsNH4}} time series. Thus it is identical #' to the tsNH4 time series except that no value is missing. #' @details The time series is derived from the dataset of the GECCO Industrial Challenge 2014. #' #' In order to use this series for comparing imputation algorithm results, #' there are two time series provided. One series without missing values, which can #' be used as ground truth. Another series with NAs, on which the imputation #' algorithms can be applied. The NAs thereby were inserted according to patterns #' found in similar time series. #' #' There are the two time series: #' \itemize{ #' \item tsNH4 - The time series with NAs. #' #' \item tsNH4Complete - Time series without NAs. #' } #' @docType data #' @keywords datasets #' @seealso \code{\link[imputeTS]{tsAirgap}},\code{\link[imputeTS]{tsHeating}} #' @name tsNH4Complete #' @usage tsNH4Complete #' #' @source \cite{Friese, Martina, Fischbach, Andreas, Flasch, Oliver, Mersmann, Olaf, #' Bartz-Beielstein, Thomas, and Walbeck, Klaus. (2014, July 16).
#' GECCO Industrial Challenge 2014 Dataset: A water quality dataset for the #' 'Active protection against pollution of the surface water' competition at the #' Genetic and Evolutionary Computation Conference 2014, Vancouver, Canada. #' http://www.spotseven.de/gecco-challenge/gecco-challenge-2014} #' #' @format Time Series (\code{\link{ts}}) with 4552 rows. NULL ================================================ FILE: README.md ================================================ [![Project Status: Active The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![R-CMD-check](https://github.com/SteffenMoritz/imputeTS/workflows/R-CMD-check/badge.svg)](https://github.com/SteffenMoritz/imputeTS/actions) [![Codecov test coverage](https://codecov.io/gh/SteffenMoritz/imputeTS/branch/master/graph/badge.svg)](https://app.codecov.io/gh/SteffenMoritz/imputeTS?branch=master) [![CRAN Version](https://www.r-pkg.org/badges/version/imputeTS)](https://cran.r-project.org/package=imputeTS) [![CRAN Release](https://www.r-pkg.org/badges/last-release/imputeTS )](https://cran.r-project.org/package=imputeTS) [![CRAN Downloads](https://cranlogs.r-pkg.org/badges/imputeTS)](https://cran.r-project.org/package=imputeTS) # imputeTS: Time Series Missing Value Imputation imputeTS Logo The imputeTS package specializes in (univariate) time series imputation. It offers several different imputation algorithm implementations. Beyond the imputation algorithms the package also provides plotting and printing functions of time series missing data statistics. Additionally three time series datasets for imputation experiments are included. ## Installation The imputeTS package can be found on [CRAN].
For installation execute in R: ``` install.packages("imputeTS") ``` If you want to install the latest version from GitHub (can be unstable) run: ``` library(devtools) install_github("SteffenMoritz/imputeTS") ``` ## Usage * #### Imputation To impute (fill all missing values) in a time series ***x***, run the following command: ``` na_interpolation(x) ``` Output is the time series ***x*** with all NA's replaced by reasonable values. > This is just one example for an imputation algorithm. > In this case interpolation was the algorithm of choice for > calculating the NA replacements. There are several other > algorithms (see also under caption [Imputation Algorithms](#imputation-algorithms)). All > imputation functions are named alike starting with > na_ followed by a algorithm label e.g. na_mean, na_kalman, ... * #### Plotting To plot missing data statistics for a time series ***x***, run the following command: ``` ggplot_na_distribution(x) ```  

Example ggplot_na_distribution plot

> This is just one exemplary plot. Overall there are five different types > of missing data plots (see also under caption [Missing Data Plots](#missing-data-plots)). > There is an additional tutorial just about plots available - the [Gallery of Visualizations]. * #### Printing To print descriptive statistics about the missing data in a time series ***x***, run the following command: ``` statsNA(x) ``` * #### Example Datasets To load the *'heating'* time series (with missing values) into a variable ***y*** and the *'heating'* time series (without missing values) into a variable ***z***, run: ``` y <- tsHeating z <- tsHeatingComplete ``` > There are three datasets provided with the package, the *'tsHeating'*, the > *'tsAirgap'* and the *'tsNH4'* time series (see also under caption [Datasets](#datasets)). ## Imputation Algorithms {#imputation-algorithms} Here is a table with available algorithms to choose from: | Function | Description | | :--------------------|:-----------------------------------------------------------| | na_interpolation |Missing Value Imputation by Interpolation | | na_kalman |Missing Value Imputation by Kalman Smoothing | | na_locf |Missing Value Imputation by Last Observation Carried Forward| | na_ma |Missing Value Imputation by Weighted Moving Average | | na_mean |Missing Value Imputation by Mean Value | | na_random |Missing Value Imputation by Random Sample | | na_remove |Remove Missing Values | | na_replace |Replace Missing Values by a Defined Value | | na_seadec |Seasonally Decomposed Missing Value Imputation | | na_seasplit |Seasonally Splitted Missing Value Imputation | > This is a rather broad overview. The functions itself mostly offer more > than just one algorithm. For example na_interpolation can be set to linear > or spline interpolation. More detailed information about the algorithms and their options can be found in the [imputeTS reference manual]. 
## Missing Data Plots {#missing-data-plots} Here is a table with available plots to choose from: | Function | Description | | :-----------------------|:-------------------------------------------------------------| | ggplot_na_distribution |Visualize Distribution of Missing Values | | ggplot_na_distribution2 |Missing Values Summarized in Time Intervals | | ggplot_na_gapsize |Visualize Distribution of NA Gapsizes | | ggplot_na_gapsize2 |Visualize Total NAs of Different NA Gapsizes | | ggplot_na_imputations |Visualize Imputed Values | More detailed information about the plots can be found in the [imputeTS reference manual] and in the [Gallery of Visualizations]. ### Datasets {#datasets} There are three datasets (each in two versions) available: | Dataset | Description | | :----------------|:-----------------------------------------------------------------| | tsAirgap |Time series of monthly airline passengers (with NAs) | | tsAirgapComplete |Time series of monthly airline passengers (complete) | | tsHeating |Time series of a heating systems supply temperature (with NAs) | | tsHeatingComplete|Time series of a heating systems supply temperature (complete) | | tsNH4 |Time series of NH4 concentration in a wastewater system (with NAs)| | tsNH4Complete |Time series of NH4 concentration in a wastewater system (complete)| > The tsAirgap, tsHeating and tsNH4 time series are with NAs. Their **complete** versions are > without NAs. Except the missing values their versions are identical. > The NAs for the time series were artifically inserted by simulating the missing > data pattern observed in similar non-complete time series from the same domain. > Having a complete and incomplete version of the same dataset is useful for > conducting experiments of imputation functions. More detailed information about the datasets can be found in the [imputeTS reference manual]. ## Reference You can cite imputeTS the following: > Moritz, Steffen, and Bartz-Beielstein, Thomas. 
"imputeTS: Time Series Missing Value Imputation in R." R Journal 9.1 (2017). doi: 10.32614/RJ-2017-009. ## Need Help? If you have general programming problems or need help using the package please ask your question on [StackOverflow]. By doing so all users will be able to benefit in the future from your question. > Don't forget to mark your question with the [imputets] tag on StackOverflow to get me notified ### Support If you found a bug or have suggestions, feel free to get in contact via steffen.moritz10 at gmail.com. > All feedback is welcome ### Version **3.4** ### License GPL-3 [CRAN]: [imputeTS reference manual]: [Citation]: [StackOverflow]: [imputets]: [Gallery of Visualizations]: ================================================ FILE: _pkgdown.yaml ================================================ title: imputeTS url: https://SteffenMoritz.github.io/imputeTS template: params: bootswatch: flatly authors: Steffen Moritz: href: https://github.com/SteffenMoritz ================================================ FILE: codecov.yml ================================================ comment: false coverage: status: project: default: target: auto threshold: 1% informational: true patch: default: target: auto threshold: 1% informational: true ================================================ FILE: docs/404.html ================================================ Page not found (404) • imputeTS
Content not found. Please use links in the navbar.

Site built with pkgdown 2.1.3.

================================================ FILE: docs/articles/gallery_visualizations.html ================================================ Gallery: Times Series Missing Data Visualizations • imputeTS

There are multiple different plots for (univariate) time series missing data available in the imputeTS package. These can be grouped in the following three categories:

  • Getting an Overview (ggplot_na_distribution)
  • Deeper insights and missing data patterns (ggplot_na_distribution2, ggplot_na_gapsize)
  • Assessing/Exploring imputation results (ggplot_na_imputations)

This vignette showcases all of the available visualizations in the imputeTS package. More information on time series imputation and the imputeTS package in general can be found in this paper: imputeTS: Time Series Missing Value Imputation in R.

Getting a first overview (ggplot_na_distribution)

The best starting point for getting an overview about the missing data in your (univariate) time series is the ggplot_na_distribution() plot. It gives a nice first overview where in the time series the missing values occur and how they are distributed. It also already gives a rough impression on how many missing data are in different intervals of the time series.

Usage is easy: just supply the (univariate) time series to the function call. Only the time series is needed as input - all additional parameters are only needed to alter the appearance of the plot.

It is important to note, that the input itself needs to be univariate. For data types with multiple variables/columns only use the column you want to plot as input parameter x. The x-axis time information can be added with the x_axis_labels parameter - otherwise the consecutive index of observations in the series is used as x-axis tick label.

Thus for a data.frame df with multiple columns df$date, df$value, df$another_value, df$yet_another_value where we want to plot df$value with Dates on the x-axis the required function call would look like this:

ggplot_na_distribution(x = df$value, x_axis_labels = df$date)

Detailed information about certain intervals (ggplot_na_distribution2)

When a summary for certain time intervals (e. g. weeks) is needed, the ggplot_na_distribution2() plot is useful. It shows the missing data percentage for each interval as a bar. This kind of summary plot is also quite useful for very long time series, which would not fit into the plot window as a lineplot.

Like for ggplot_na_distribution() only parameter x (the univariate time series) is mandatory for creating a plot with ggplot_na_distribution2(). With the parameter interval_size the size of the interval can be changed (the default is an automatically calculated interval size that gives a good overall overview). All other parameters are mostly needed for changing the appearance of the plot.

Alternatively the missing data count for the interval (instead of the percentages) can be shown. Below is an example with a custom interval size of 144 and a custom color for the missing data bars. Since the example data is recorded in 10 minute time steps, an interval_size of 144 means that we are using daily intervals (6 measurements per hour, 24 hours per day, 6*24 = 144).

ggplot_na_distribution2(tsNH4, measure = "count", interval_size = 144, color_missing = "gold3")

Insights about missing data patterns (ggplot_na_gapsize)

Often deeper insights about the missing data are quite useful. These insights can give hints of possible causes of the missing data and an indication, which imputation algorithms might give good results. The plot gives an overview about how often different gapsizes (NAs in a row) occur in the time series.

Only the parameter x (the univariate time series) is needed as mandatory input. By default the plot shows only the 10 most often occurring gapsizes. Use parameter limit to increase this number.

The plot shows both the number of occurrences and the resulting NAs for the respective gapsizes. Resulting NAs can be explained as the number of NAs a certain gapsize accounts for in total. For example a gapsize of 3 that occurs 5 times results in 15 NAs overall. The parameter include_total can be used to change this behavior. Below is an example of the same plot with specific settings for limit and include_total.

library(imputeTS)
ggplot_na_gapsize(tsNH4, include_total = F, limit = 15)

Evaluate imputation results (ggplot_na_imputations)

After using imputation functions like na_kalman(), na_interpolation(), na_seadec() there is often the need to get a first impression of how well the algorithm performs. The ggplot_na_imputations() plot gives a good impression of how well the imputed values fit into the original time series.

Mandatory inputs for this function are these two parameters: x_with_na (the time series as it was before imputation) and x_with_imputations (the time series without NAs after imputation).

library(imputeTS)
imp <- na_interpolation(tsAirgap)
ggplot_na_imputations(tsAirgap, imp)

In some cases (mostly when performing imputation experiments and benchmarks) the NAs were only artificially introduced into the original time series. This means there is a ground truth for the NA values (the complete time series before introducing the NAs). In this case you can additionally use the x_with_truth parameter to get a plot that displays both the imputations and the ground truth.

library(imputeTS)
imp <- na_mean(tsAirgap)
ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp, x_with_truth = tsAirgapComplete )

Support

If you found a bug or have suggestions, feel free to open an issue on GitHub or get in contact via steffen.moritz10 at gmail.com.

All feedback is welcome

Site built with pkgdown 2.1.3.

================================================ FILE: docs/articles/gallery_visualizations_files/accessible-code-block-0.0.1/empty-anchor.js ================================================ // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> // v0.0.1 // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. document.addEventListener('DOMContentLoaded', function() { const codeList = document.getElementsByClassName("sourceCode"); for (var i = 0; i < codeList.length; i++) { var linkList = codeList[i].getElementsByTagName('a'); for (var j = 0; j < linkList.length; j++) { if (linkList[j].innerHTML === "") { linkList[j].setAttribute('aria-hidden', 'true'); } } } }); ================================================ FILE: docs/articles/gallery_visualizations_files/header-attrs-2.16/header-attrs.js ================================================ // Pandoc 2.9 adds attributes on both header and div. We remove the former (to // be compatible with the behavior of Pandoc < 2.8). document.addEventListener('DOMContentLoaded', function(e) { var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); var i, h, a; for (i = 0; i < hs.length; i++) { h = hs[i]; if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 a = h.attributes; while (a.length > 0) h.removeAttribute(a[0].name); } }); ================================================ FILE: docs/articles/gallery_visualizations_files/header-attrs-2.7/header-attrs.js ================================================ // Pandoc 2.9 adds attributes on both header and div. We remove the former (to // be compatible with the behavior of Pandoc < 2.8). 
document.addEventListener('DOMContentLoaded', function(e) { var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); var i, h, a; for (i = 0; i < hs.length; i++) { h = hs[i]; if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 a = h.attributes; while (a.length > 0) h.removeAttribute(a[0].name); } }); ================================================ FILE: docs/articles/index.html ================================================ Articles • imputeTS

Site built with pkgdown 2.1.3.

================================================ FILE: docs/authors.html ================================================ Authors and Citation • imputeTS
  • Steffen Moritz. Author, maintainer, copyright holder.

  • Sebastian Gatscha. Author.

  • Earo Wang. Contributor.

  • Ron Hause. Contributor.

Citation

Source: inst/CITATION

Steffen Moritz, Thomas Bartz-Beielstein (2017). “imputeTS: Time Series Missing Value Imputation in R.” The R Journal, 9(1), 207–218. doi:10.32614/RJ-2017-009.

@Article{,
  title = {{imputeTS: Time Series Missing Value Imputation in R}},
  author = {{Steffen Moritz} and {Thomas Bartz-Beielstein}},
  journal = {{The R Journal}},
  volume = {9},
  number = {1},
  pages = {207--218},
  year = {2017},
  doi = {10.32614/RJ-2017-009},
}

Site built with pkgdown 2.1.3.

================================================ FILE: docs/bootstrap-toc.css ================================================
/*!
 * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/)
 * Copyright 2015 Aidan Feldman
 * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md)
 */

/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */

/* All levels of nav */
/* Base look of every link inside a table-of-contents nav list. */
nav[data-toggle='toc'] .nav > li > a {
  display: block;
  padding: 4px 20px;
  font-size: 13px;
  font-weight: 500;
  color: #767676;
}
/* Hover/focus: a 1px left border appears and the left padding shrinks by 1px
   (20px -> 19px), which presumably keeps the link text from shifting. */
nav[data-toggle='toc'] .nav > li > a:hover,
nav[data-toggle='toc'] .nav > li > a:focus {
  padding-left: 19px;
  color: #563d7c;
  text-decoration: none;
  background-color: transparent;
  border-left: 1px solid #563d7c;
}
/* Active entry: bold, with a 2px left border and 18px left padding. */
nav[data-toggle='toc'] .nav > .active > a,
nav[data-toggle='toc'] .nav > .active:hover > a,
nav[data-toggle='toc'] .nav > .active:focus > a {
  padding-left: 18px;
  font-weight: bold;
  color: #563d7c;
  background-color: transparent;
  border-left: 2px solid #563d7c;
}

/* Nav: second level (shown on .active) */
nav[data-toggle='toc'] .nav .nav {
  display: none; /* Hide by default, but at >768px, show it */
  padding-bottom: 10px;
}
/* Nested links: smaller font, tighter vertical padding, deeper indent. */
nav[data-toggle='toc'] .nav .nav > li > a {
  padding-top: 1px;
  padding-bottom: 1px;
  padding-left: 30px;
  font-size: 12px;
  font-weight: normal;
}
nav[data-toggle='toc'] .nav .nav > li > a:hover,
nav[data-toggle='toc'] .nav .nav > li > a:focus {
  padding-left: 29px;
}
nav[data-toggle='toc'] .nav .nav > .active > a,
nav[data-toggle='toc'] .nav .nav > .active:hover > a,
nav[data-toggle='toc'] .nav .nav > .active:focus > a {
  padding-left: 28px;
  font-weight: 500;
}

/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */
/* The nested list directly under the active top-level entry is made visible. */
nav[data-toggle='toc'] .nav > .active > ul {
  display: block;
}

================================================ FILE: docs/bootstrap-toc.js
================================================ /*! * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) * Copyright 2015 Aidan Feldman * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ (function() { 'use strict'; window.Toc = { helpers: { // return all matching elements in the set, or their descendants findOrFilter: function($el, selector) { // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ // http://stackoverflow.com/a/12731439/358804 var $descendants = $el.find(selector); return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); }, generateUniqueIdBase: function(el) { var text = $(el).text(); var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); return anchor || el.tagName.toLowerCase(); }, generateUniqueId: function(el) { var anchorBase = this.generateUniqueIdBase(el); for (var i = 0; ; i++) { var anchor = anchorBase; if (i > 0) { // add suffix anchor += '-' + i; } // check if ID already exists if (!document.getElementById(anchor)) { return anchor; } } }, generateAnchor: function(el) { if (el.id) { return el.id; } else { var anchor = this.generateUniqueId(el); el.id = anchor; return anchor; } }, createNavList: function() { return $(''); }, createChildNavList: function($parent) { var $childList = this.createNavList(); $parent.append($childList); return $childList; }, generateNavEl: function(anchor, text) { var $a = $(''); $a.attr('href', '#' + anchor); $a.text(text); var $li = $('
  • '); $li.append($a); return $li; }, generateNavItem: function(headingEl) { var anchor = this.generateAnchor(headingEl); var $heading = $(headingEl); var text = $heading.data('toc-text') || $heading.text(); return this.generateNavEl(anchor, text); }, // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). getTopLevel: function($scope) { for (var i = 1; i <= 6; i++) { var $headings = this.findOrFilter($scope, 'h' + i); if ($headings.length > 1) { return i; } } return 1; }, // returns the elements for the top level, and the next below it getHeadings: function($scope, topLevel) { var topSelector = 'h' + topLevel; var secondaryLevel = topLevel + 1; var secondarySelector = 'h' + secondaryLevel; return this.findOrFilter($scope, topSelector + ',' + secondarySelector); }, getNavLevel: function(el) { return parseInt(el.tagName.charAt(1), 10); }, populateNav: function($topContext, topLevel, $headings) { var $context = $topContext; var $prevNav; var helpers = this; $headings.each(function(i, el) { var $newNav = helpers.generateNavItem(el); var navLevel = helpers.getNavLevel(el); // determine the proper $context if (navLevel === topLevel) { // use top level $context = $topContext; } else if ($prevNav && $context === $topContext) { // create a new level of the tree and switch to it $context = helpers.createChildNavList($prevNav); } // else use the current $context $context.append($newNav); $prevNav = $newNav; }); }, parseOps: function(arg) { var opts; if (arg.jquery) { opts = { $nav: arg }; } else { opts = arg; } opts.$scope = opts.$scope || $(document.body); return opts; } }, // accepts a jQuery object, or an options object init: function(opts) { opts = this.helpers.parseOps(opts); // ensure that the data attribute is in place for styling opts.$nav.attr('data-toggle', 'toc'); var $topContext = this.helpers.createChildNavList(opts.$nav); var topLevel = this.helpers.getTopLevel(opts.$scope); var $headings = this.helpers.getHeadings(opts.$scope, topLevel); this.helpers.populateNav($topContext, topLevel, $headings); } }; $(function() { $('nav[data-toggle="toc"]').each(function(i, el) { var $nav = $(el); Toc.init($nav); }); }); })(); ================================================ FILE: docs/docsearch.css ================================================ /* Docsearch 
-------------------------------------------------------------- */ /* Source: https://github.com/algolia/docsearch/ License: MIT */ .algolia-autocomplete { display: block; -webkit-box-flex: 1; -ms-flex: 1; flex: 1 } .algolia-autocomplete .ds-dropdown-menu { width: 100%; min-width: none; max-width: none; padding: .75rem 0; background-color: #fff; background-clip: padding-box; border: 1px solid rgba(0, 0, 0, .1); box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); } @media (min-width:768px) { .algolia-autocomplete .ds-dropdown-menu { width: 175% } } .algolia-autocomplete .ds-dropdown-menu::before { display: none } .algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { padding: 0; background-color: rgb(255,255,255); border: 0; max-height: 80vh; } .algolia-autocomplete .ds-dropdown-menu .ds-suggestions { margin-top: 0 } .algolia-autocomplete .algolia-docsearch-suggestion { padding: 0; overflow: visible } .algolia-autocomplete .algolia-docsearch-suggestion--category-header { padding: .125rem 1rem; margin-top: 0; font-size: 1.3em; font-weight: 500; color: #00008B; border-bottom: 0 } .algolia-autocomplete .algolia-docsearch-suggestion--wrapper { float: none; padding-top: 0 } .algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { float: none; width: auto; padding: 0; text-align: left } .algolia-autocomplete .algolia-docsearch-suggestion--content { float: none; width: auto; padding: 0 } .algolia-autocomplete .algolia-docsearch-suggestion--content::before { display: none } .algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { padding-top: .75rem; margin-top: .75rem; border-top: 1px solid rgba(0, 0, 0, .1) } .algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { display: block; padding: .1rem 1rem; margin-bottom: 0.1; font-size: 1.0em; font-weight: 400 /* display: none */ } .algolia-autocomplete .algolia-docsearch-suggestion--title { display: block; padding: .25rem 1rem; 
margin-bottom: 0; font-size: 0.9em; font-weight: 400 } .algolia-autocomplete .algolia-docsearch-suggestion--text { padding: 0 1rem .5rem; margin-top: -.25rem; font-size: 0.8em; font-weight: 400; line-height: 1.25 } .algolia-autocomplete .algolia-docsearch-footer { width: 110px; height: 20px; z-index: 3; margin-top: 10.66667px; float: right; font-size: 0; line-height: 0; } .algolia-autocomplete .algolia-docsearch-footer--logo { background-image: url("data:image/svg+xml;utf8,"); background-repeat: no-repeat; background-position: 50%; background-size: 100%; overflow: hidden; text-indent: -9000px; width: 100%; height: 100%; display: block; transform: translate(-8px); } .algolia-autocomplete .algolia-docsearch-suggestion--highlight { color: #FF8C00; background: rgba(232, 189, 54, 0.1) } .algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) } .algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { background-color: rgba(192, 192, 192, .15) } ================================================ FILE: docs/docsearch.js ================================================ $(function() { // register a handler to move the focus to the search bar // upon pressing shift + "/" (i.e. 
"?") $(document).on('keydown', function(e) { if (e.shiftKey && e.keyCode == 191) { e.preventDefault(); $("#search-input").focus(); } }); $(document).ready(function() { // do keyword highlighting /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ var mark = function() { var referrer = document.URL ; var paramKey = "q" ; if (referrer.indexOf("?") !== -1) { var qs = referrer.substr(referrer.indexOf('?') + 1); var qs_noanchor = qs.split('#')[0]; var qsa = qs_noanchor.split('&'); var keyword = ""; for (var i = 0; i < qsa.length; i++) { var currentParam = qsa[i].split('='); if (currentParam.length !== 2) { continue; } if (currentParam[0] == paramKey) { keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); } } if (keyword !== "") { $(".contents").unmark({ done: function() { $(".contents").mark(keyword); } }); } } }; mark(); }); }); /* Search term highlighting ------------------------------*/ function matchedWords(hit) { var words = []; var hierarchy = hit._highlightResult.hierarchy; // loop to fetch from lvl0, lvl1, etc. for (var idx in hierarchy) { words = words.concat(hierarchy[idx].matchedWords); } var content = hit._highlightResult.content; if (content) { words = words.concat(content.matchedWords); } // return unique words var words_uniq = [...new Set(words)]; return words_uniq; } function updateHitURL(hit) { var words = matchedWords(hit); var url = ""; if (hit.anchor) { url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; } else { url = hit.url + '?q=' + escape(words.join(" ")); } return url; } ================================================ FILE: docs/index.html ================================================ Time Series Missing Value Imputation • imputeTS

    Project Status: Active The project has reached a stable, usable state and is being actively developed. R-CMD-check Codecov test coverage CRAN Version CRAN Release CRAN Downloads

    The imputeTS package specializes in (univariate) time series imputation. It offers several different imputation algorithm implementations. Beyond the imputation algorithms, the package also provides plotting and printing functions for time series missing data statistics. Additionally, three time series datasets for imputation experiments are included.

    Installation

    The imputeTS package can be found on CRAN. For installation execute in R:

     install.packages("imputeTS")

    If you want to install the latest version from GitHub (can be unstable) run:

    library(devtools)
    install_github("SteffenMoritz/imputeTS")

    Usage

    • Imputation

      To impute (fill all missing values) in a time series x, run the following command:

       na_interpolation(x)

      Output is the time series x with all NA’s replaced by reasonable values.

      This is just one example of an imputation algorithm. In this case, interpolation was the algorithm of choice for calculating the NA replacements. There are several other algorithms (see also under caption Imputation Algorithms). All imputation functions are named alike, starting with na_ followed by an algorithm label, e.g. na_mean, na_kalman, …

    • Plotting

      To plot missing data statistics for a time series x, run the following command:

       ggplot_na_distribution(x)

       

      Example ggplot_na_distribution plot

      This is also just one example for a plot. Overall there are five different types of missing data plots (see also under caption Missing Data Plots). There is also an additional tutorial just about plots - the Gallery of Visualizations.

    • Printing

      To print statistics about the missing data in a time series x, run the following command:

       statsNA(x)
    • Example Datasets

      To load the ‘heating’ time series (with missing values) into a variable y and the ‘heating’ time series (without missing values) into a variable z, run:

       y <- tsHeating
       z <- tsHeatingComplete

      There are three datasets provided with the package, the ‘tsHeating’, the ‘tsAirgap’ and the ‘tsNH4’ time series (see also under caption Datasets).

    Imputation Algorithms

    Here is a table with available algorithms to choose from:

    Function Description
    na_interpolation Missing Value Imputation by Interpolation
    na_kalman Missing Value Imputation by Kalman Smoothing
    na_locf Missing Value Imputation by Last Observation Carried Forward
    na_ma Missing Value Imputation by Weighted Moving Average
    na_mean Missing Value Imputation by Mean Value
    na_random Missing Value Imputation by Random Sample
    na_remove Remove Missing Values
    na_replace Replace Missing Values by a Defined Value
    na_seadec Seasonally Decomposed Missing Value Imputation
    na_seasplit Seasonally Splitted Missing Value Imputation

    This is a rather broad overview. The functions themselves mostly offer more than just one algorithm. For example, na_interpolation can be set to linear or spline interpolation.

    More detailed information about the algorithms and their options can be found in the imputeTS reference manual.

    Missing Data Plots

    Here is a table with available plots to choose from:

    Function Description
    ggplot_na_distribution Visualize Distribution of Missing Values
    ggplot_na_distribution2 Missing Values Summarized in Time Intervals
    ggplot_na_gapsize Visualize Distribution of NA Gapsizes
    ggplot_na_gapsize2 Visualize Total NAs of Different NA Gapsizes
    ggplot_na_imputations Visualize Imputed Values

    More detailed information about the plots can be found in the imputeTS reference manual and in the Gallery of Visualizations.

    Datasets

    There are three datasets (each in two versions) available:

    Dataset Description
    tsAirgap Time series of monthly airline passengers (with NAs)
    tsAirgapComplete Time series of monthly airline passengers (complete)
    tsHeating Time series of a heating systems supply temperature (with NAs)
    tsHeatingComplete Time series of a heating systems supply temperature (complete)
    tsNH4 Time series of NH4 concentration in a wastewater system (with NAs)
    tsNH4Complete Time series of NH4 concentration in a wastewater system (complete)

    The tsAirgap, tsHeating and tsNH4 time series contain NAs. Their complete versions contain no NAs. Except for the missing values, both versions are identical. The NAs were artificially inserted by simulating the missing data pattern observed in similar non-complete time series from the same domain. Having a complete and an incomplete version of the same dataset is useful for conducting experiments with imputation functions.

    More detailed information about the datasets can be found in the imputeTS reference manual.

    Reference

    You can cite imputeTS as follows:

    Moritz, Steffen, and Bartz-Beielstein, Thomas. “imputeTS: Time Series Missing Value Imputation in R.” R Journal 9.1 (2017). doi: 10.32614/RJ-2017-009.

    Need Help?

    If you have general programming problems or need help using the package please ask your question on StackOverflow. By doing so all users will be able to benefit in the future from your question.

    Don’t forget to mark your question with the imputets tag on StackOverflow so that I get notified.

    Support

    If you found a bug or have suggestions, feel free to get in contact via steffen.moritz10 at gmail.com.

    All feedback is welcome

    Version

    3.4

    License

    GPL-3

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/news/index.html ================================================ Changelog • imputeTS
    • Added ggplot_na_gapsize2 plot (and unit tests). Nice way to illustrate how different NA gapsizes (consecutive NAs in a row) amount for NA totals.

    • Fix of the CITATION file to comply with newer CRAN rules

    • Update of ggplot_na_imputations() to avoid using now deprecated ggplot2 options

    • Update of unit tests for all plotting functions (ggplot_na_…). Now using is_ggplot() to check for correct output. This was necessary because of a major ggplot2 update (switch to s7 classes).

    Thanks to Sabrina Krys, Kevin Villalobos, Tracy Shen, hezhichao1991, englianhu for bug / issue reporting. Thanks to RicardaP for fixing a documentation error. Thanks to Ronald Hause for the commit to optimize parameter pass through from approx to na_interpolation.

    • Renamed ggplot_na_intervals to ggplot_na_distribution2

    • Updates to ggplot_na_gapsize: Space between the bars adjusted for better optics. Added parameters for directly choosing the bar border color and alpha value for filling of the bars.

    • Improved notification message for na_seadec/na_seasplit when find_frequency couldn’t find a seasonal pattern.

    • Corrected error in na_kalman documentation - auto.arima was wrongly described as default parameter choice, while in reality it is StructTS (reported by RicardaP)

    • Changes for the error handling. (These changes got reverted and did not make it into the CRAN release.) For some specific cases the input checks performed by imputeTS stop pipe workflows in their entirety, e.g. when group_by leads to all-NA subsets, which fail the input check and then stop the whole pipe workflow. To prevent this, stop() is only called when the user-supplied imputeTS algorithm parameter options are wrong or misspelled. Unsupported input data will only give a warning() (and no action is performed on the data). Thus, there is no call to stop() that cancels the whole pipe workflow (issue reported by Sabrina Krys). This works fine, but after closer consideration we figured people fail to notice warnings way too often and thus it is more user friendly to clearly stop with an error for these issues. After all, the user's data analysis clearly profits from taking a closer look in these specific cases. If you are nevertheless interested in the version without the reverted changes, it can be installed from GitHub with the following command: devtools::install_github("https://github.com/SteffenMoritz/imputeTS/commit/aaf759216b4091e36dee6e8e3a10185ff8f4647b")

    • Improved error messages (especially for multivariate inputs) and unit tests for the warnings and errors.

    • Corrected typo in ‘Input data needs at least x non-NA data points’ error message

    • Better parameter pass through from approx to na_interpolation - added capability to alter the rule for linear extrapolation outside the interval [min(x), max(x)] (commit by Ronald Hause)

    • Improved na_interpolation documentation (more information about possible parameter pass through from underlying spline, approx,stinterp functions)

    • Additional unit tests

    • Moved to Github Actions instead of TravisCI / AppVeyor.

    • Bugfix for the error “Error in optim(init[mask], getLike, method = "L-BFGS-B", lower = rep(0, : L-BFGS-B needs finite values of 'fn'”, which occurred for completely constant input to na_kalman, e.g. 4,4,4,NA,4,4 (reported by Kevin Villalobos, Tracy Shen, hezhichao1991, englianhu)

    • Improved na_seadec documentation (algorithm details)

    • Changed R Version requirement in Description to R (>= 3.6) since imported packages like ggtext and also some testthat tests were already requiring newer versions than the old R (≥ 3.0.1) requirement of imputeTS

    Thanks to Mark J. Lamias for bug / issue reporting. Thanks to Cyrus Mohammadian for bug reporting. Thanks to Miroslaw Janik for issue reporting.

    • Fix to remove CRAN note - removed not used utils from DESCRIPTION imports

    • Minor fix to ggplot_na_distribution (bars end now at max(timeseries)*1.05)

    • Typo corrections in statsNA

    • Specified ggplot2 (>= 3.3.0) in imports, to prevent errors with older ggplot2 versions (reported by Cyrus Mohammadian)

    • Updated na_locf documentation to make behavior of na_remaining parameter more clear (issue reported by Mark J. Lamias)

    • ggplot_na_intervals now shows percentages with a % sign (e.g. 10%) on the y-scale instead of just numbers (e.g. 0.1) (suggestion from Miroslaw Janik)

    • Added some figures and the Cheat Sheet .pptx to .Rbuildignore to avoid CRAN warning about
      package size. These files and figures were not needed for the CRAN version.

    Thanks to Johannes Menzel for bug reporting. Thanks to Jan (jmablans) for bug reporting. Thanks to Earo Wang for speedup of plotNA.gapsize. Special thanks to Sebastian Gatscha for plotting functions, new na_mean options, new unit tests.

    • Plotting functions are all in ggplot now (way better looking). Additionally they got renamed accordingly ggplot_na_distribution, ggplot_na_intervals, ggplot_na_gapsize, ggplot_na_imputations.

    • Speedup for plotNA_gapsize calculation (now renamed ggplot_na_gapsize) (thx to Earo Wang)

    • Added harmonic and geometric mean as option for na_mean

    • Removed bug in na_replace - it can now be used with all NA vectors since it requires no minimum of non-NA values (reported by Jan - jmablans)

    • Improved na.random input check (usable with all NA input now if upper and lower bound parameters are explicitly set to numeric values)

    • Additional unit tests for the plotting functions

    • Additional unit tests for all the imputation functions (testing all NA input)

    • Update for testthat unit tests

    • Fixed a mistake in README.md (reported by Johannes Menzel)

    • Added to statsNA: Number of Gaps, Average Gap Size + reformatting of code + compatibility with other ts objects

    • Documentation improvements through newer roxygen version (Markup now possible in documentation)

    • updated Readme + Vignette to new function names

    • Added the imputeTS Cheat Sheet as Vignette

    • Added new vignette Gallery Missing Data Visualizations

    • Added revdep

    Thanks to Jim Maas, shreydesai, Breza, CameronNemo for reporting bugs. Thanks to Sebastian Gatscha providing the (way faster) C++ na.ma() implementation.

    • tibble and tstibble compatibility

    • Reworked internal code documentation

    • na.ma speed up via C++

    • Changed vignette builder to R.rsp

    • Used R package styler package to optimize source code readability

    • Made some changes to better follow tidyverse style guide

    • Replaced na. with na_, e.g. na.mean with na_mean etc. This better fits modern code style guidelines. The old function names will still work for a while, but give a warning.

    • Added findFrequency option to na.seadec and na.seasplit

    • Added maxgap option

    • Fixed bug for na.seadec - also imputed known values in some special cases (reported by CameronNemo)

    • Added doi: 10.32614/RJ-2017-009 to description, references, readme and citation file

    • Added StackExchange link to Readme

    • Moved stinepack from imports to suggested

    • Internal reorganization of imports - now always using pkg::function and importFrom pkg x1 x2 x3 instead of just import pkg

    • Fixed bug in na.ma when using xts time series with NA at the end

    • Fixed error message in na.interpolation if wrong parameter is given
      stop(“Wrong parameter ‘option’ given. Value must be either ‘linear’, ‘spline’ or ‘stine’.”) (reported by Breza)

    • Fixed spelling mistakes in na.seadec and na.seasplit (reported by shreydesai)

    • Fixed bug with na.random() output (reported by Jim Maas)

    • Updated Description: Orcid Id added, packages required for unit tests added as “Suggested”

    • Small correction in README.md, small update to citation file

    • Replaced NEWS with NEWS.md for better formatting

    • Updated citation file

    • Minor changes to vignette

    • Adjusted unit test to an update of the forecast package
    • Small speed improvements for na.kalman

    • Improved input check for all functions

    • Bugfix for unit tests

    • Changes to unit test (because of zoo update)

    • Bugfix for na.kalman with integer input

    • Readme Update

    • Improved error messages for na.seasplit and na.seadec

    • Minor vignette changes

    • Bugfix for na.locf (also concerned na.kalman)
    • Fixes for problems with Solaris/Sparc

    • Fixes for problems with vignette on osx

    • Bugfix for plots without missing data

    • Increased performance for na.locf

    • Minor bugfixes for specific data.frame inputs

    • Minor bugfixes for specific xts object inputs

    • Improved Code Documentation

    • Added new software tests

    • Added Vignette
    • Computation time improvements for na.locf (up to 10000 times faster)

    • Computation time improvements for na.interpolation (up to 10000 times faster)

    • Computation time improvements for na.kalman (only slightly faster, under 10%)

    • Fixed unnecessary warning message with some na.kalman options

    • Adjusted default parameters for plotNA.distributionBar (using nclass.Sturges for breaks parameter)

    • Fixed issue with too sensitive input checking

    • Enabled usage of multivariate input (data.frame, mts, matrix,…) for all imputation functions except na.remove. This means users do not have to loop through all columns by themselves anymore if they want to use the package with multivariate data. The imputation itself is still performed in a univariate manner (column after column).

    • Improved compatibility with different advanced time series objects like zoo and xts. Using the imputation functions with these time series objects should be possible now. These series will not be explicitly named as possible input in the user documentation. Absence of errors can not be guaranteed. However, there are no known issues yet.

    • Added several things for unit tests with pkg ‘testthat’

    • Added unit tests for every function

    • Adjusted error messages

    • Internal Coding style improvement: replaced all T with TRUE and all F with FALSE

    • Adjustment tsHeating / tsHeatingComplete datasets (set 1440 as frequency parameter)

    • Adjustment tsNH4 / tsNH4Complete datasets (set 144 as frequency parameter)

    • Fixes for grammar, spelling and citations in the whole documentation

    • Revised examples in the documentation for all functions

    • Restricted output of na.remove to vector only (issue with incorrect time information otherwise)

    • Added better x-axes labels for plotNA.distribution

    • Added github links to description file

    • Added citation file

    • Updated Readme (badges for travis ci and cran status)

    • Fix in documentation for na.interpolation (due to outdated descriptions)

    • Fix in documentation plotNA.distribution / plotNA.distributionBar (due to interchanged descriptions)

    • Added references to used packages in na.kalman and na.interpolation documentation

    • Allows now also numeric vectors as input

    • Removed na.identifier parameter for all functions (too error prone, better handled individually by the user)

    • Minor changes in na.interpolation with option = “stine”

    • Added na.ma imputation function

    • Replaced “data” in all function parameters with the more common “x”

    • Improvement of all code examples

    • Renamed heating/heatingComplete dataset to tsHeating/tsHeatingComplete

    • Renamed nh4/nh4Complete dataset to tsNH4/tsNH4Complete

    • Added tsAirgap / tsAirgapComplete datasets

    • Improved imputeTS-package documentation

    • Added na.kalman imputation function

    • Added README.md function

    • Added statsNA function

    • Added plotNA.gapsize function

    • Renamed vis.imputations to plotNA.imputations

    • Renamed vis.barMissing to plotNA.distributionBar

    • Renamed vis.missing to plotNA.distribution

    • Fixed issues with parameter pass through and legend for all plotting functions

    • Improved dataset documentation

    • Update of vis.differences (better looking plot now)

    • Added vis.missing to visualize the distribution of missing data in a time series

    • Added vis.barMissing, which is especially suited to visualize missing data in very huge time series

    • Update na.interpolate (added Stineman interpolation and enabled … parameter for all interpolation algorithms to pass through parameters to the underlying functions)

    • Added two datasets of sensor data

    • vis.differences for plotting differences between real and imputed values

    • Removed internal functions from visible package documentation

    • Added additional algorithms: na.seasplit and na.seadec

    • internal function for algorithm selection

    • Created initial version of imputeTS package for univariate time series imputation

    • added the simple imputation functions: na.locf, na.mean, na.random, na.interpolation, na.replace

    • added na.remove function for removing all NAs from a time series

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/pkgdown.css ================================================ /* Sticky footer */ /** * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css * * .Site -> body > .container * .Site-content -> body > .container .row * .footer -> footer * * Key idea seems to be to ensure that .container and __all its parents__ * have height set to 100% * */ html, body { height: 100%; } body { position: relative; } body > .container { display: flex; height: 100%; flex-direction: column; } body > .container .row { flex: 1 0 auto; } footer { margin-top: 45px; padding: 35px 0 36px; border-top: 1px solid #e5e5e5; color: #666; display: flex; flex-shrink: 0; } footer p { margin-bottom: 0; } footer div { flex: 1; } footer .pkgdown { text-align: right; } footer p { margin-bottom: 0; } img.icon { float: right; } /* Ensure in-page images don't run outside their container */ .contents img { max-width: 100%; height: auto; } /* Fix bug in bootstrap (only seen in firefox) */ summary { display: list-item; } /* Typographic tweaking ---------------------------------*/ .contents .page-header { margin-top: calc(-60px + 1em); } dd { margin-left: 3em; } /* Section anchors ---------------------------------*/ a.anchor { display: none; margin-left: 5px; width: 20px; height: 20px; background-image: url(./link.svg); background-repeat: no-repeat; background-size: 20px 20px; background-position: center center; } h1:hover .anchor, h2:hover .anchor, h3:hover .anchor, h4:hover .anchor, h5:hover .anchor, h6:hover .anchor { display: inline-block; } /* Fixes for fixed navbar --------------------------*/ .contents h1, .contents h2, .contents h3, .contents h4 { padding-top: 60px; margin-top: -40px; } /* Navbar submenu --------------------------*/ .dropdown-submenu { position: relative; } .dropdown-submenu>.dropdown-menu { top: 
0; left: 100%; margin-top: -6px; margin-left: -1px; border-radius: 0 6px 6px 6px; } .dropdown-submenu:hover>.dropdown-menu { display: block; } .dropdown-submenu>a:after { display: block; content: " "; float: right; width: 0; height: 0; border-color: transparent; border-style: solid; border-width: 5px 0 5px 5px; border-left-color: #cccccc; margin-top: 5px; margin-right: -10px; } .dropdown-submenu:hover>a:after { border-left-color: #ffffff; } .dropdown-submenu.pull-left { float: none; } .dropdown-submenu.pull-left>.dropdown-menu { left: -100%; margin-left: 10px; border-radius: 6px 0 6px 6px; } /* Sidebar --------------------------*/ #pkgdown-sidebar { margin-top: 30px; position: -webkit-sticky; position: sticky; top: 70px; } #pkgdown-sidebar h2 { font-size: 1.5em; margin-top: 1em; } #pkgdown-sidebar h2:first-child { margin-top: 0; } #pkgdown-sidebar .list-unstyled li { margin-bottom: 0.5em; } /* bootstrap-toc tweaks ------------------------------------------------------*/ /* All levels of nav */ nav[data-toggle='toc'] .nav > li > a { padding: 4px 20px 4px 6px; font-size: 1.5rem; font-weight: 400; color: inherit; } nav[data-toggle='toc'] .nav > li > a:hover, nav[data-toggle='toc'] .nav > li > a:focus { padding-left: 5px; color: inherit; border-left: 1px solid #878787; } nav[data-toggle='toc'] .nav > .active > a, nav[data-toggle='toc'] .nav > .active:hover > a, nav[data-toggle='toc'] .nav > .active:focus > a { padding-left: 5px; font-size: 1.5rem; font-weight: 400; color: inherit; border-left: 2px solid #878787; } /* Nav: second level (shown on .active) */ nav[data-toggle='toc'] .nav .nav { display: none; /* Hide by default, but at >768px, show it */ padding-bottom: 10px; } nav[data-toggle='toc'] .nav .nav > li > a { padding-left: 16px; font-size: 1.35rem; } nav[data-toggle='toc'] .nav .nav > li > a:hover, nav[data-toggle='toc'] .nav .nav > li > a:focus { padding-left: 15px; } nav[data-toggle='toc'] .nav .nav > .active > a, nav[data-toggle='toc'] .nav .nav > 
.active:hover > a, nav[data-toggle='toc'] .nav .nav > .active:focus > a { padding-left: 15px; font-weight: 500; font-size: 1.35rem; } /* orcid ------------------------------------------------------------------- */ .orcid { font-size: 16px; color: #A6CE39; /* margins are required by official ORCID trademark and display guidelines */ margin-left:4px; margin-right:4px; vertical-align: middle; } /* Reference index & topics ----------------------------------------------- */ .ref-index th {font-weight: normal;} .ref-index td {vertical-align: top; min-width: 100px} .ref-index .icon {width: 40px;} .ref-index .alias {width: 40%;} .ref-index-icons .alias {width: calc(40% - 40px);} .ref-index .title {width: 60%;} .ref-arguments th {text-align: right; padding-right: 10px;} .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} .ref-arguments .name {width: 20%;} .ref-arguments .desc {width: 80%;} /* Nice scrolling for wide elements --------------------------------------- */ table { display: block; overflow: auto; } /* Syntax highlighting ---------------------------------------------------- */ pre, code, pre code { background-color: #f8f8f8; color: #333; } pre, pre code { white-space: pre-wrap; word-break: break-all; overflow-wrap: break-word; } pre { border: 1px solid #eee; } pre .img, pre .r-plt { margin: 5px 0; } pre .img img, pre .r-plt img { background-color: #fff; } code a, pre a { color: #375f84; } a.sourceLine:hover { text-decoration: none; } .fl {color: #1514b5;} .fu {color: #000000;} /* function */ .ch,.st {color: #036a07;} /* string */ .kw {color: #264D66;} /* keyword */ .co {color: #888888;} /* comment */ .error {font-weight: bolder;} .warning {font-weight: bolder;} /* Clipboard --------------------------*/ .hasCopyButton { position: relative; } .btn-copy-ex { position: absolute; right: 0; top: 0; visibility: hidden; } .hasCopyButton:hover button.btn-copy-ex { visibility: visible; } /* headroom.js ------------------------ */ .headroom { 
will-change: transform; transition: transform 200ms linear; } .headroom--pinned { transform: translateY(0%); } .headroom--unpinned { transform: translateY(-100%); } /* mark.js ----------------------------*/ mark { background-color: rgba(255, 255, 51, 0.5); border-bottom: 2px solid rgba(255, 153, 51, 0.3); padding: 1px; } /* vertical spacing after htmlwidgets */ .html-widget { margin-bottom: 10px; } /* fontawesome ------------------------ */ .fab { font-family: "Font Awesome 5 Brands" !important; } /* don't display links in code chunks when printing */ /* source: https://stackoverflow.com/a/10781533 */ @media print { code a:link:after, code a:visited:after { content: ""; } } /* Section anchors --------------------------------- Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 */ div.csl-bib-body { } div.csl-entry { clear: both; } .hanging-indent div.csl-entry { margin-left:2em; text-indent:-2em; } div.csl-left-margin { min-width:2em; float:left; } div.csl-right-inline { margin-left:2em; padding-left:1em; } div.csl-indent { margin-left: 2em; } ================================================ FILE: docs/pkgdown.js ================================================ /* http://gregfranko.com/blog/jquery-best-practices/ */ (function($) { $(function() { $('.navbar-fixed-top').headroom(); $('body').css('padding-top', $('.navbar').height() + 10); $(window).resize(function(){ $('body').css('padding-top', $('.navbar').height() + 10); }); $('[data-toggle="tooltip"]').tooltip(); var cur_path = paths(location.pathname); var links = $("#navbar ul li a"); var max_length = -1; var pos = -1; for (var i = 0; i < links.length; i++) { if (links[i].getAttribute("href") === "#") continue; // Ignore external links if (links[i].host !== location.host) continue; var nav_path = paths(links[i].pathname); var length = prefix_length(nav_path, cur_path); if (length > max_length) { max_length = length; pos = i; } } // Add class to parent
  • , and enclosing
  • if in dropdown if (pos >= 0) { var menu_anchor = $(links[pos]); menu_anchor.parent().addClass("active"); menu_anchor.closest("li.dropdown").addClass("active"); } }); function paths(pathname) { var pieces = pathname.split("/"); pieces.shift(); // always starts with / var end = pieces[pieces.length - 1]; if (end === "index.html" || end === "") pieces.pop(); return(pieces); } // Returns -1 if not found function prefix_length(needle, haystack) { if (needle.length > haystack.length) return(-1); // Special case for length-0 haystack, since for loop won't run if (haystack.length === 0) { return(needle.length === 0 ? 0 : -1); } for (var i = 0; i < haystack.length; i++) { if (needle[i] != haystack[i]) return(i); } return(haystack.length); } /* Clipboard --------------------------*/ function changeTooltipMessage(element, msg) { var tooltipOriginalTitle=element.getAttribute('data-original-title'); element.setAttribute('data-original-title', msg); $(element).tooltip('show'); element.setAttribute('data-original-title', tooltipOriginalTitle); } if(ClipboardJS.isSupported()) { $(document).ready(function() { var copyButton = ""; $("div.sourceCode").addClass("hasCopyButton"); // Insert copy buttons: $(copyButton).prependTo(".hasCopyButton"); // Initialize tooltips: $('.btn-copy-ex').tooltip({container: 'body'}); // Initialize clipboard: var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { text: function(trigger) { return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); } }); clipboardBtnCopies.on('success', function(e) { changeTooltipMessage(e.trigger, 'Copied!'); e.clearSelection(); }); clipboardBtnCopies.on('error', function() { changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); }); }); } })(window.jQuery || window.$) ================================================ FILE: docs/pkgdown.yml ================================================ pandoc: '3.4' pkgdown: 2.1.3 pkgdown_sha: ~ articles: gallery_visualizations: 
gallery_visualizations.html last_built: 2025-08-24T22:33Z urls: reference: https://SteffenMoritz.github.io/imputeTS/reference article: https://SteffenMoritz.github.io/imputeTS/articles ================================================ FILE: docs/reference/ggplot_na_distribution.html ================================================ Line Plot to Visualize the Distribution of Missing Values — ggplot_na_distribution • imputeTS

    Visualize the distribution of missing values within a time series.

    ggplot_na_distribution(
      x,
      x_axis_labels = NULL,
      color_points = "steelblue",
      color_lines = "steelblue2",
      color_missing = "indianred",
      color_missing_border = "indianred",
      alpha_missing = 0.5,
      title = "Distribution of Missing Values",
      subtitle = "Time Series with highlighted missing regions",
      xlab = "Time",
      ylab = "Value",
      shape_points = 20,
      size_points = 2.5,
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.

    x_axis_labels

    For adding specific x-axis labels. Takes a vector of Date or POSIXct objects as input (must have the same length as x). The default (NULL) uses the observation numbers as x-axis tick labels.

    color_points

    Color for the Symbols/Points.

    color_lines

    Color for the Lines.

    color_missing

    Color used for highlighting the time spans with NA values.

    color_missing_border

    Color used as border for time spans with NA values.

    alpha_missing

    Alpha (transparency) value used for color_missing.

    title

    Title of the Plot (NULL for deactivating title).

    subtitle

    Subtitle of the Plot (NULL for deactivating subtitle).

    xlab

    Label for x-Axis.

    ylab

    Label for y-Axis.

    shape_points

    Symbol to use for the Observations/Points. See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference.

    size_points

    Size of Symbols/Points.

    theme

    Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Details

    This function visualizes the distribution of missing values within a time series. If a value is NA, the background is colored differently. This gives a good overview of where most missing values occur.

    The only really needed parameter for this function is x (the univariate time series that shall be visualized). All other parameters are solely for altering the appearance of the plot.

    As long as the input is univariate and numeric the function also takes data.frame, tibble, tsibble, zoo, xts as an input.

    The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.

    For very long time series, the plot may become too crowded and overplotting issues can occur. In this case, the ggplot_na_distribution2 plotting function can provide a more condensed overview.

    Author

    Steffen Moritz, Sebastian Gatscha

    Examples

    # Example 1: Visualize the missing values in x
    x <- stats::ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7))
    ggplot_na_distribution(x)
    
    
    # Example 2: Visualize the missing values in tsAirgap time series
    ggplot_na_distribution(tsAirgap)
    
    
    # Example 3: Same as example 1, just written with pipe operator
    x <- ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7))
    x %>% ggplot_na_distribution()
    
    
    # Example 4: Visualize NAs in tsAirgap - different color for points
    # Plot adjustments via ggplot_na_distribution function parameters
    ggplot_na_distribution(tsAirgap, color_points = "grey")
    
    
    # Example 5: Visualize NAs in tsAirgap - different theme
    # Plot adjustments via ggplot_na_distribution function parameters
    ggplot_na_distribution(tsAirgap, theme = ggplot2::theme_classic())
    
    
    # Example 6: Visualize NAs in tsAirgap - title, subtitle in center
    # Plot adjustments via ggplot2 syntax
    ggplot_na_distribution(tsAirgap) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
      ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
    
    
    # Example 7: Visualize NAs in tsAirgap - title in center, no subtitle
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_distribution(tsAirgap, subtitle = NULL) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    
    
    # Example 8: Visualize NAs in tsAirgap - x-axis texts with angle
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_distribution(tsAirgap, color_points = "grey") +
      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
    
      
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/ggplot_na_distribution2.html ================================================ Stacked Bar Plot to Visualize Missing Values per Time Interval — ggplot_na_distribution2 • imputeTS

    Visualization of missing values in barplot form. Especially useful when looking at specific intervals and for time series with a lot of observations.

    ggplot_na_distribution2(
      x,
      number_intervals = NULL,
      interval_size = NULL,
      measure = "percent",
      color_missing = "indianred2",
      color_existing = "steelblue",
      alpha_missing = 0.8,
      alpha_existing = 0.3,
      title = "Missing Values per Interval",
      subtitle = "Amount of NA and non-NA for successive intervals",
      xlab = "Time Lapse (Interval Size: XX)",
      ylab = NULL,
      color_border = "white",
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.

    number_intervals

    Defines the number of bins to be created. Default number of intervals (denoted by NULL) is calculated by nclass.Sturges using Sturges' formula. If the interval_size parameter is set to a value different to NULL this parameter is ignored.

    interval_size

    Defines how many observations should be in one bin/interval. The required number of overall bins is afterwards calculated automatically. If used, this parameter overrides the number_intervals parameter. For a very long time series, make sure the interval_size is not extremely small; otherwise, because of overplotting, nothing can be seen unless you also increase the plot width.

    measure

    Whether the NA / non-NA ratio should be given as percent or absolute numbers.

    • "percent" - for percentages

    • "count" - for absolute numbers of NAs

    color_missing

    Color for the amount of missing values.

    color_existing

    Color for the amount of existing values.

    alpha_missing

    Alpha (transparency) value for the missing values.

    alpha_existing

    Alpha (transparency) value for the existing values.

    title

    Title of the Plot (NULL for deactivating title).

    subtitle

    Subtitle of the Plot (NULL for deactivating subtitle).

    xlab

    Label for x-Axis. Automatically set to the current interval size, if no custom text is chosen.

    ylab

    Label for y-Axis. As default (NULL), the axis is automatically set to either 'Percent' or 'Count' dependent on the settings of parameter measure.

    color_border

    Color for the small borders between the intervals/bins. Default is 'white'.

    theme

    Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Details

    This function visualizes the distribution of missing values within a time series. In comparison to the ggplot_na_distribution function, this is not done by plotting each observation of the time series separately. Instead, observations for time intervals are represented as intervals/bins of multiple values. For these intervals, information about the amount of missing values is shown. This has the advantage that, even for large time series, a plot that is easy to take in at a glance can be created.

    The only really needed parameter for this function is x (the univariate time series that shall be visualized). All other parameters are solely for altering the appearance of the plot.

    As long as the input is univariate and numeric the function also takes data.frame, tibble, tsibble, zoo, xts as an input.

    The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.

    Author

    Steffen Moritz, Sebastian Gatscha

    Examples

    # Example 1: Visualize the missing values in tsNH4 time series as percentages
    ggplot_na_distribution2(tsNH4)
    
    
    # Example 2: Visualize the missing values in tsNH4 time series as counts 
    ggplot_na_distribution2(tsNH4, measure = "count")
    
    
    # Example 3: Visualize the missing values in tsHeating time series
    ggplot_na_distribution2(tsHeating)
    
    
    # Example 4: Same as example 1, just written with pipe operator
    tsNH4 %>% ggplot_na_distribution2()
    
    
    # Example 5: Visualize NAs in tsNH4 - exactly 8 intervals
    ggplot_na_distribution2(tsNH4, number_intervals = 8)
    
    
    # Example 6: Visualize NAs in tsNH4 - 300 observations per interval
    ggplot_na_distribution2(tsNH4, interval_size = 300)
    
    
    # Example 7: Visualize NAs in tsAirgap - different color for NAs
    # Plot adjustments via ggplot_na_distribution2 function parameters
    ggplot_na_distribution2(tsAirgap, color_missing = "pink")
    
    
    # Example 8: Visualize NAs in tsNH4 - different theme
    # Plot adjustments via ggplot_na_distribution2 function parameters
    ggplot_na_distribution2(tsNH4, theme = ggplot2::theme_classic())
    
    
    # Example 9: Visualize NAs in tsAirgap - title, subtitle in center
    # Plot adjustments via ggplot2 syntax
    ggplot_na_distribution2(tsAirgap) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
      ggplot2::theme(plot.subtitle = ggtext::element_markdown(hjust = 0.5))
    
    
    # Example 10: Visualize NAs in tsAirgap - title in center, no subtitle
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_distribution2(tsAirgap, subtitle = NULL) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    
    
    # Example 11: Visualize NAs in tsAirgap - x-axis texts with angle
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_distribution2(tsAirgap, color_missing = "grey") +
      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
    
      
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/ggplot_na_gapsize.html ================================================ Bar Plot to Visualize Occurrences of Different NA Gap Sizes — ggplot_na_gapsize • imputeTS

    Visualize the Number of Occurrences for existing NA Gap Sizes (NAs in a row) in a Time Series

    ggplot_na_gapsize(
      x,
      limit = 10,
      include_total = TRUE,
      ranked_by = "occurrence",
      color_occurrence = "indianred",
      color_total = "steelblue",
      color_border = "black",
      alpha_bars = 1,
      title = "Occurrence of gap sizes",
      subtitle = "Gap sizes (NAs in a row) ordered by most common",
      xlab = NULL,
      ylab = "Number occurrence",
      legend = TRUE,
      orientation = "horizontal",
      label_occurrence = "Number occurrence gapsize",
      label_total = "Resulting NAs for gapsize",
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.

    limit

    Specifies how many of the most common gap sizes are shown in the plot. Default is 10, so only the 10 most frequently occurring gap sizes will be shown. If more or all present gap sizes should be displayed, the limit needs to be increased. Since this might add a lot of additional data, having parameter orientation set to 'horizontal' avoids overlaps in the axis labels.

    include_total

    When set to TRUE the total NA count for a gapsize is included in the plot (total = number occurrence x gap size). E.g. if a gapsize of 3 occurs 10 times, this means this gap size makes up for 30 NAs in total. This can be a good indicator of the overall impact of a gapsize.

    ranked_by

    Should the results be sorted according to the number of occurrence or total resulting NAs for a gapsize. Total resulting NAs are calculated by (total = number occurrence x gap size).

    • "occurrence" - Sorting by 'number of occurrence' of a gap size

    • "total" - Sorting by 'total resulting NAs' of a gap size

    The default setting is "occurrence".

    color_occurrence

    Defines the Color for the Bars of 'number of occurrence'.

    color_total

    Defines the color for the bars of 'total resulting NAs'.

    color_border

    Defines the color for the border of the bars.

    alpha_bars

    Alpha (transparency) value used for filling the bars.

    title

    Title of the Plot.

    subtitle

    Subtitle of the Plot.

    xlab

    Label for x-Axis.

    ylab

    Label for y-Axis.

    legend

    If TRUE a legend is added at the bottom.

    orientation

    Can be either 'vertical' or 'horizontal'. Defines if the bars are plotted vertically or horizontally. For large amounts of different gap sizes horizontal illustration is favorable (also see parameter limit).

    label_occurrence

    Defines the label assigned to 'number of occurrence' in the legend.

    label_total

    Defines the label assigned to 'total resulting NAs' in the legend.

    theme

    Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Value

    The output is a ggplot2 object that can be further adjusted by using the ggplot syntax

    Details

    This plotting function can be used to visualize the length of the NA gaps (NAs in a row) in a time series. It shows a ranking of which gap sizes occur most often. This ranking can be ordered by the number of occurrences of the gap sizes or by the total resulting NAs for each gap size (occurrence * gap length). A NA-gap of 3 occurring 10 times means 30 total resulting NAs.

    A resulting plot can for example be described like this: a 2 NA-gap (2 NAs in a row) occurred 27 times, a 9 NA-gap (9 NAs in a row) occurred 11 times, a 27 NA-gap (27 NAs in a row) occurred 1 time, ...

    The only really needed parameter for this function is x (the univariate time series with NAs that shall be visualized). All other parameters are solely for altering the appearance of the plot.

    As long as the input is univariate and numeric, the function also takes data.frame, tibble, tsibble, zoo, xts as an input.

    The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.

    Author

    Steffen Moritz, Sebastian Gatscha

    Examples

    # Example 1: Visualize the top gap sizes in tsNH4 (top 10 by default)
    ggplot_na_gapsize(tsNH4)
    
    
    # Example 2: Visualize the top gap sizes in tsAirgap - vertical bars
    ggplot_na_gapsize(tsAirgap, orientation = "vertical")
    
    
    # Example 3: Same as example 1, just written with pipe operator
    tsNH4 %>% ggplot_na_gapsize()
    
    
    # Example 4: Visualize the top 20 gap sizes in tsNH4
    ggplot_na_gapsize(tsNH4, limit = 20)
    
    
    # Example 5: Visualize top gap sizes in tsNH4 without showing total NAs
    ggplot_na_gapsize(tsNH4, limit = 20, include_total = FALSE)
    
    
    # Example 6: Visualize top gap sizes in tsNH4 but ordered by total NAs
    # (total = occurrence * gap length)
    ggplot_na_gapsize(tsNH4, limit = 20, ranked_by = "total")
    
    
    # Example 7: Visualize top gap sizes in tsNH4 - different theme
    # Plot adjustments via ggplot_na_gapsize function parameters
    ggplot_na_gapsize(tsNH4, theme = ggplot2::theme_classic())
    
    
    # Example 8: Visualize top gap sizes in tsNH4 - title, subtitle in center
    # Plot adjustments via ggplot2 syntax
    ggplot_na_gapsize(tsNH4) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
      ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
    
    
    # Example 9: Visualize top gap sizes in tsNH4 - title in center, no subtitle
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_gapsize(tsNH4, subtitle = NULL) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    
    
    # Example 10: Top gap sizes in tsNH4 - legend on the right and color change
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_gapsize(tsNH4, color_total = "grey") +
      ggplot2::theme(legend.position = "right")
    
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/ggplot_na_gapsize2.html ================================================ Bubble Plot to Visualize Total NA Count of NA gap sizes — ggplot_na_gapsize2 • imputeTS

    Visualize the total NA count (gap size * occurrence) for the existing gap sizes (NAs in a row).

    ggplot_na_gapsize2(
      x,
      colors_bubbles = c("#FCFBFF", "#EFEEFA", "#DDDAEF", "#C8C3E2", "#B1AAD4", "#9A8FC4",
        "#8273B5", "#6B56A7", "#553695", "#3D1778"),
      color_border = "black",
      alpha_bubbles = 0.4,
      labels_bubbles = "none",
      size_bubbles = 25,
      min_totals = NULL,
      min_occurrence = NULL,
      min_gapsize = NULL,
      max_gapsize = NULL,
      title = "Gap Size Analysis",
      subtitle = "Total NA counts for different gapsizes",
      xlab = "Gapsize",
      ylab = "Number occurrence",
      legend = TRUE,
      legend_breaks = 4,
      legend_title = "Total NAs",
      legend_position = "right",
      legend_point_sizes = "default",
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.

    colors_bubbles

    Choose a color gradient that encodes lower to higher total NA counts. Color codes can be given as vector. Using color palettes from colorspace, grDevices, RColorBrewer or other packages is useful here. E.g. grDevices::heat.colors(10) would be a possible input.

    color_border

    Color for the border of the bubbles.

    alpha_bubbles

    Alpha (transparency) value used for filling the bubbles.

    labels_bubbles

    Should labels be added to the individual bubbles inside the plot. For many datasets there will be overplotting issues once labels are added. In these cases using the min_gapsize, min_totals or min_occurrence options might be useful to only display the most relevant gap sizes.

    You can choose between these labels to be added:

    • "none" - No label gets added to the bubbles (default choice)

    • "gap" - Adds a label displaying the gap size belonging to the respective bubble

    • "total" - Adds a label displaying the total NA count for the respective bubble

    • "gap-occurrence" - Adds a label displaying the respective gap size and number of its occurrence

    The default setting is "none".

    size_bubbles

    Allows to scale the size of the bubbles. Some experimenting with this parameter might be needed to get a good visualization for your specific dataset.

    min_totals

    Only print bubbles for gap sizes that account for at least min_totals NAs in the time series.

    min_occurrence

    Only print bubbles for gap sizes that occur at least min_occurrence times in the time series.

    min_gapsize

    Only show gap sizes larger than min_gapsize. Together with max_gapsize, this enables zooming into certain regions of interest.

    max_gapsize

    Only show gap sizes smaller than max_gapsize. Together with min_gapsize, this enables zooming into certain regions of interest.

    title

    Title of the Plot.

    subtitle

    Subtitle of the Plot.

    xlab

    Label for x-Axis.

    ylab

    Label for y-Axis.

    legend

    If TRUE a legend is added on the right side

    legend_breaks

    Number of displayed breaks / labels in the legend. Needs an integer giving the desired number of breaks as input. Breakpoints are internally calculated by R's pretty() function, which can also lead to values slightly smaller or larger than the desired number.

    legend_title

    Defines the title of the legend.

    legend_position

    Defines the position of the legend. Choose either 'bottom', 'right', 'left' or 'top'.

    legend_point_sizes

    Defines the size of the symbols representing the total NA bubbles in the legend.

    You can choose between "default", "actual" or a custom vector of sizes.

    • "default" - Scales the points in the legend to symbolically resemble the size differences (default choice)

    • "actual" - Scales the points in the legend according to their actual size in the plot

    Since these two options are not always sufficient, a custom vector of sizes can be used as input. This would look like this: c(4,5,6,7). Be aware that the length of this vector must match the number of breakpoints (can be adjusted with legend_breaks).

    theme

    Set a theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Value

    The output is a ggplot2 object that can be further adjusted by using the ggplot syntax

    Details

    This function visualizes total NA counts by individual gap size (consecutive NAs) in a time series. The bubble plot makes it easy to see which gap sizes account for most of the NAs in the series. The size and color of the bubbles represent the total number of NAs a given gap size accounts for.

    Total NAs for a gap size are calculated as follows: total NAs = occurrence * gap length

    For example, interpret a bubble for gap size 2 as follows: a 2-NA gap (two NAs in a row) occurred 27 times in the time series and thus accounts for 54 total NAs.

    On the x-axis, the different gap sizes are plotted in increasing order. The y-axis shows the occurrence count of these gap sizes in the time series.

    The plot is useful for investigating possible root causes of the missing data. It can indicate whether the missing data are random or whether there are patterns of interest.

    Depending on the input time series, there might be too much information in the plot, leading to overplotting. In these cases, use the parameters min_totals, min_occurrence, and min_gapsize to display only the information of interest.

    The only required parameter is x (the univariate time series with NAs to visualize). All other parameters alter the appearance of the plot.

    As long as the input is univariate and numeric, the function also accepts data.frame, tibble, tsibble, zoo, or xts input.

    The plot can be adjusted via function parameters. For more complex adjustments, you can modify the result using ggplot2 syntax, since the function returns a ggplot2 object. See the Examples for typical adjustments.

    Author

    Steffen Moritz

    Examples

    # Example 1: Visualize total NA counts in tsNH4
    ggplot_na_gapsize2(tsNH4)
    
    
    # Example 2: Visualize total NA counts in tsNH4, different color gradient
    ggplot_na_gapsize2(tsNH4, colors_bubbles = rev(grDevices::heat.colors(10)))
    
    
    # Example 3: Same as example 1, just written with pipe operator
    tsNH4 %>% ggplot_na_gapsize2()
    
    
    # Example 4: Visualize total NA counts in tsHeating
    # Limited to gap sizes that account for a total of > 600 NAs
    ggplot_na_gapsize2(tsHeating, min_totals = 600)
    
    
    # Example 5: Visualize total NA counts in tsNH4 - no legend
    ggplot_na_gapsize2(tsNH4, legend = FALSE)
    
    
    # Example 6: Visualize total NA counts in tsAirgap - increased bubble size
    ggplot_na_gapsize2(tsAirgap, size_bubbles = 35)
    
    
    # Example 7: Visualize total NA counts in tsNH4
    # Plot adjustments via ggplot_na_gapsize2 function parameters
    ggplot_na_gapsize2(tsNH4, theme = ggplot2::theme_classic())
    
    
    # Example 8: Visualize total NA counts in tsNH4 - title, subtitle in center
    # Plot adjustments via ggplot2 syntax
    ggplot_na_gapsize2(tsNH4) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
      ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
    
    
    # Example 9: Visualize total NA counts in tsNH4 - title in center, no subtitle
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_gapsize2(tsNH4, subtitle = NULL) +
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    
    
    # Example 10: Total NA counts in tsNH4 - legend on the bottom and color change
    # Plot adjustments via ggplot2 syntax and function parameters
    ggplot_na_gapsize2(tsNH4, colors_bubbles = grDevices::heat.colors(10)) +
      ggplot2::theme(legend.position = "bottom")
    
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/ggplot_na_imputations.html ================================================ Line Plot to Visualize Imputed Values — ggplot_na_imputations • imputeTS

    Visualize the imputed values in a time series.

    ggplot_na_imputations(
      x_with_na,
      x_with_imputations,
      x_with_truth = NULL,
      x_axis_labels = NULL,
      title = "Imputed Values",
      subtitle = "Visualization of missing value replacements",
      xlab = "Time",
      ylab = "Value",
      color_points = "steelblue",
      color_imputations = "indianred",
      color_truth = "seagreen3",
      color_lines = "lightslategray",
      shape_points = 16,
      shape_imputations = 18,
      shape_truth = 16,
      size_points = 1.5,
      size_imputations = 2.5,
      size_truth = 1.5,
      width_lines = 0.5,
      linetype = "solid",
      connect_na = TRUE,
      legend = TRUE,
      legend_size = 5,
      label_known = "known values",
      label_imputations = "imputed values",
      label_truth = "ground truth",
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x_with_na

    Numeric Vector or Time Series (ts) object with NAs before imputation. This parameter and x_with_imputations have to be set. The rest of the parameters are mostly needed for adjusting the plot appearance.

    x_with_imputations

    Numeric Vector or Time Series (ts) object with NAs replaced by imputed values. This parameter and x_with_na have to be set. The rest of the parameters are mostly needed for adjusting the plot appearance.

    x_with_truth

    Numeric Vector or Time Series (ts) object with the real values (optional parameter). If the ground truth is known (e.g. in experiments where the missing values were artificially added) it can be displayed in the plot with this parameter. Default is NULL (ground truth not known).

    x_axis_labels

    For adding specific x-axis labels. Takes a vector of Date or POSIXct objects as an input (needs the same length as x_with_na). The Default (NULL) uses the observation numbers as x-axis tick labels.

    title

    Title of the Plot.

    subtitle

    Subtitle of the Plot.

    xlab

    Label for x-Axis.

    ylab

    Label for y-Axis.

    color_points

    Color for the Symbols/Points of the non-NA Observations.

    color_imputations

    Color for the Symbols/Points of the Imputed Values.

    color_truth

    Color for the Symbols/Points of the NA value Ground Truth (only relevant when x_with_truth available).

    color_lines

    Color for the Lines connecting the Observations/Points.

    shape_points

    Shape for the Symbols/Points of the non-NA observations. See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference.

    shape_imputations

    Shape for the Symbols/Points of the imputed values. See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference.

    shape_truth

    Shape for the Symbols/Points of the NA value Ground Truth (only relevant when x_with_truth available).

    size_points

    Size for the Symbols/Points of the non-NA Observations.

    size_imputations

    Size for the Symbols/Points of the Imputed Values.

    size_truth

    Size for the Symbols/Points of the NA value Ground Truth (only relevant when x_with_truth available).

    width_lines

    Width for the Lines connecting the Observations/Points.

    linetype

    Linetype for the Lines connecting the Observations/Points.

    connect_na

    If TRUE the Imputations are connected to the non-NA observations in the plot. Otherwise there are no connecting lines between symbols in NA areas.

    legend

    If TRUE a Legend is added at the bottom.

    legend_size

    Size of the Symbols used in the Legend.

    label_known

    Legend label for the non-NA Observations.

    label_imputations

    Legend label for the Imputed Values.

    label_truth

    Legend label for the Ground Truth of the NA values.

    theme

    Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Details

    This plot can be used, to visualize imputed values for a time series. Imputed values (filled NA gaps) are shown in a different color than the other values. If real values (ground truth) for the NA gaps are known, they can be optionally added in a different color.

    The only really needed parameters for this function are x_with_na (the time series with NAs before imputation) and x_with_imputations (the time series without NAs after imputation). All other parameters are mostly for altering the appearance of the plot.

    As long as the input is univariate and numeric the function also takes data.frame, tibble, tsibble, zoo, xts as an input.

    The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.

    Author

    Steffen Moritz, Sebastian Gatscha

    Examples

    # Example 1: Visualize imputation by na_mean
    imp_mean <- na_mean(tsAirgap)
    ggplot_na_imputations(tsAirgap, imp_mean)
    
    
    
    # Example 2: Visualize imputation by na_locf and added ground truth 
    imp_locf <- na_locf(tsAirgap)
    ggplot_na_imputations(x_with_na = tsAirgap, 
                          x_with_imputations = imp_locf,
                          x_with_truth = tsAirgapComplete
                          )
    
    
    
    # Example 3: Visualize imputation by na_kalman
    imp_kalman <- na_kalman(tsAirgap)
    ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_kalman)
    
    
    
    # Example 4: Same as example 1, just written with pipe operator
    tsAirgap %>%
      na_mean() %>%
      ggplot_na_imputations(x_with_na = tsAirgap)
    
      
    
    # Example 5: Visualize imputation by na_seadec - different color for imputed points
    # Plot adjustments via ggplot_na_imputations function parameters
    imp_seadec <- na_seadec(tsAirgap)
    ggplot_na_imputations(x_with_na = tsAirgap, 
                          x_with_imputations = imp_seadec,
                          color_imputations = "gold")
    
    
    
    # Example 6: Visualize imputation - different theme, point size imputations
    # Plot adjustments via ggplot_na_imputations function parameters
    imp_seadec <- na_seadec(tsAirgap)
    ggplot_na_imputations(x_with_na = tsAirgap, 
                          x_with_imputations = imp_seadec,
                          theme = ggplot2::theme_classic(),
                          size_imputations = 5)
    
    
                     
    # Example 7: Visualize imputation - title, subtitle in center
    # Plot adjustments via ggplot2 syntax
    imp_seadec <- na_seadec(tsAirgap)
    ggplot_na_imputations(x_with_na = tsAirgap,  x_with_imputations = imp_seadec) + 
        ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))   
    
    
    
    # Example 8: Visualize imputation - title in center, no subtitle
    # Plot adjustments via ggplot2 syntax and function parameters
    imp_mean <- na_mean(tsAirgap)
    ggplot_na_imputations(x_with_na = tsAirgap,  
                          x_with_imputations = imp_mean,
                          subtitle = NULL) +
         ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    
      
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/ggplot_na_intervals.html ================================================ Discontinued - Use ggplot_na_distribution2 instead. — ggplot_na_intervals • imputeTS

    plotNA.distributionBar was replaced by ggplot_na_distribution2. The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions.

    ggplot_na_intervals(x, ...)

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/ggplot_na_level.html ================================================ Dotplot of Value Distribution directly before/after NAs — ggplot_na_level • imputeTS

    Visualize the distribution of values directly before/after NAs via a dotplot. Useful to determine if missing values appear more often when a certain threshold level is reached.

    ggplot_na_level(
      x,
      number_bins = ifelse(length(x)/10 < 30, 30, length(x)/10),
      color_before = "steelblue",
      color_after = "yellowgreen",
      color_regular = "azure2",
      title = "Before/After Analysis",
      subtitle = "Values before and after NAs",
      xlab = NULL,
      ylab = NULL,
      legend = TRUE,
      legend_title = "",
      orientation = "vertical",
      label_before = "before",
      label_after = "after",
      label_regular = "regular",
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.

    number_bins

    Number of bins of stacked observations to be created. Default is length of time series divided by ten - but with a minimum of 30 bins.

    color_before

    Color for the dots representing observations directly before NA gaps.

    color_after

    Color for the dots representing observations directly after NA gaps.

    color_regular

    Color for the dots representing all values that are not next to NA observations.

    title

    Title of the plot (NULL for deactivating title).

    subtitle

    Subtitle of the plot (NULL for deactivating subtitle).

    xlab

    Label for x-Axis.

    ylab

    Label for y-Axis.

    legend

    If TRUE a legend is added at the bottom.

    legend_title

    Title for the legend.

    orientation

    Can be either 'vertical' or 'horizontal'. Defines if the plot is oriented vertically or horizontally.

    label_before

    Defines the legend label assigned to the observations directly before NAs.

    label_after

    Defines the legend label assigned to the observations directly after NAs.

    label_regular

    Defines the legend label assigned to the observations, that are not next to NA values.

    theme

    Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Details

    This function visualizes the distribution of values directly before/after NAs via a dotplot. This is useful to determine if missing values appear more often when near to a certain value level.

    In a geom_dotplot each dot represents one observation in the time series. It can be directly seen how many values are stacked into a bin (a value range).

    The ggplot_na_level plot makes use of this and additionally colors observations before and after NAs differently.

    The visualization of the before/after NA observations in a bin in comparison to the regular observations can provide information about the root cause of the missing values. It also can provide indications, about the missing data mechanism (MCAR, MAR, MNAR).

    By looking at this plot it can be seen whether the NAs appear rather randomly after some values in the overall distribution or if e.g. it can be said NAs more likely appear after high values.

    It could, for example be the case, that a sensor can't measure values above 100 degree and always outputs NA values once the temperature reaches 100 degree. With this plot, it can be realized, that NAs in the next value always occur when the temperature is close to 100 degree.

    Thus, unusually high numbers of dots of before/after NA observations in a bin (in comparison to the number of dots of other observations in this bin) should draw the users' attention.

    The advantage of the dotplot of ggplot_na_level over the violin plots of ggplot_na_level2 is that each observation in the time series is really displayed as a dot in the dotplot. For the user this can feel more intuitive. Especially for very short time series the violins/boxplots and the summary statistics they provide are not so meaningful anymore. On the other hand, the ggplot_na_level is not a good choice for large time series. Drawing a visible dot for each observation reaches its limits when the time series is larger than 500 observations. Also, while our assessment of distributions and anomalies usually works adequately on small amounts of data, we often struggle with large amounts of data. Here the violin/boxplot combination of ggplot_na_level2 is a great help.

    The only really needed parameter for this function is x (the univariate time series that shall be visualized). All other parameters are solely for altering the appearance of the plot.

    As long as the input is univariate and numeric, the function also takes data.frame, tibble, tsibble, zoo, xts as an input.

    The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.

    See also

    Author

    Steffen Moritz

    Examples

    # Example 1: Visualize the before/after NA distributions x <- stats::ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7)) ggplot_na_level(x)
    # Example 2: Visualize the before/after in subset of tsNH4 time series, more bins ggplot_na_level(tsNH4[1:500], number_bins = 100)
    # Example 3: Same as example 1, just written with pipe operator x <- ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7)) x %>% ggplot_na_level()
    # Example 4: Visualize the before/after NA in tsAirgap - different color for dots # Plot adjustments via ggplot_na_level function parameters ggplot_na_level(tsAirgap, color_after = "green")
    # Example 5: Visualize before/after NA in tsAirgap - different theme and orientation # Plot adjustments via ggplot_na_level function parameters ggplot_na_level(tsAirgap, theme = ggplot2::theme_classic() , orientation = "horizontal")
    # Example 6: Visualize before/after NA in tsAirgap - title, subtitle in center # Plot adjustments via ggplot2 syntax ggplot_na_level(tsAirgap) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
    # Example 7: Visualize before/after NA in tsAirgap - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_level(tsAirgap, subtitle = NULL, orientation = "horizontal") + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    # Example 8: Visualize before/after NA in tsAirgap - y-axis texts with angle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_level(tsAirgap, color_regular = "grey") + ggplot2::theme(axis.text.y = ggplot2::element_text(angle = 60, hjust = 1))

    Site built with pkgdown 1.6.1.

    ================================================ FILE: docs/reference/ggplot_na_level2.html ================================================ Parallel Violin Plots of Value Distributions directly before/after NAs — ggplot_na_level2 • imputeTS

    Visualize the distribution of values directly before/after NAs via violin plots. Useful to determine if missing values appear more often when a certain threshold level is reached.

    ggplot_na_level2(
      x,
      inside_information = "boxplot",
      color_before = "pink3",
      color_after = "pink3",
      color_source = "steelblue",
      color_inside = "black",
      alpha_violin = 0.5,
      alpha_inside = 0.9,
      title = "Before/After Analysis",
      subtitle = "Level of values occurring directly before and after NAs",
      xlab = "",
      ylab = "Value",
      legend = FALSE,
      orientation = "vertical",
      label_before = "before",
      label_after = "after",
      label_source = "source",
      add_n_label = T,
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.

    inside_information

    Defines what is inside the violin as an additional distribution visualization. Accepts the following input:

    • "boxplot" - Adds a boxplot inside the violins (default choice)

    • "points" - Adds jittered points inside the violins

    • "none" - Adds nothing inside the violins

    Beware, though, that using the jitter option "points" can lead to overplotting for larger time series.

    color_before

    Color to fill the violin representing observations directly before NA gaps.

    color_after

    Color to fill the violin representing observations directly after NA gaps.

    color_source

    Color to fill the violin representing the distribution of all non-NA values of a time series.

    color_inside

    Color used for the inside information (color of boxplot border or color of points).

    alpha_violin

    Alpha (transparency) value used for the violin.

    alpha_inside

    Alpha (transparency) value used for the inside information in the violin (boxplot, points).

    title

    Title of the plot (NULL for deactivating title).

    subtitle

    Subtitle of the plot (NULL for deactivating subtitle).

    xlab

    Label for x-Axis.

    ylab

    Label for y-Axis.

    legend

    If TRUE a legend is added at the bottom.

    orientation

    Can be either 'vertical' or 'horizontal'. Defines if the violin plot is oriented vertically or horizontally.

    label_before

    Defines the label assigned to the violin for values directly before NAs.

    label_after

    Defines the label assigned to the violin for values directly after NAs.

    label_source

    Defines the label assigned to the violin for the distribution of all values.

    add_n_label

    Whether to automatically add an n-value (e.g. n = 100) to the labels as an indication of how many observations are represented by the violins.

    theme

    Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Details

    This function visualizes the distribution of values directly before/after NAs via violin plots. This is useful to determine if missing values appear more often when near to a certain value level.

    As described in geom_violin: 'A violin plot is a compact display of a continuous distribution. A violin plot is a mirrored density plot displayed in the same way as a boxplot.'

    The visualization of the before/after NA distributions in comparison to the overall distribution can provide information about the root cause of the missing values. It also can provide indications about the missing data mechanism (MCAR, MAR, MNAR).

    The default plot consists of three violin/boxplot combinations - one for all values directly before NAs, one for all values directly after NAs and one for the overall distribution of all non-NA values.

    By looking at these plots it can be seen whether the NAs appear rather randomly after some values in the overall distribution or if e.g. it can be said NAs more likely appear after high values.

    It could, for example be the case, that a sensor can't measure values above 100 degree and always outputs NA values once the temperature reaches 100 degree. With these plots it could be realized, that NAs in the next value always occur when the temperature is close to 100 degree.

    Some more technical implementation details:

    The middle violin with the distribution of all non-NA observations also includes the values directly before/after the NAs.

    Only the values directly before and after the NA gap are used for the before/after violins.

    For the example series 6, 2, NA, NA, NA, 3, 6 this would mean:

    • The 2 value goes into the before distribution

    • The 3 value goes into the after distribution

    • Both 6 are not in before or after, since only values directly before or after the gaps are considered

    • No extra values added to before/after as representatives for the middle NAs

    So the source/overall distribution for this series would be {6, 2, 3, 6}, the before {2} and the after {3}.

    Of course the overall plot only makes sense with a longer time series with more missing values.

    The only really needed parameter for this function is x (the univariate time series that shall be visualized). All other parameters are solely for altering the appearance of the plot.

    As long as the input is univariate and numeric, the function also takes data.frame, tibble, tsibble, zoo, xts as an input.

    The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.

    See also

    Author

    Steffen Moritz

    Examples

    # Example 1: Visualize the before/after NA distributions x <- stats::ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7)) ggplot_na_level2(x)
    # Example 2: Visualize the before/after NA distributions in tsNH4 time series ggplot_na_level2(tsNH4)
    # Example 3: Same as example 1, just written with pipe operator x <- ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7)) x %>% ggplot_na_level2()
    # Example 4: Visualize the before/after NA in tsAirgap - different color for violins # Plot adjustments via ggplot_na_level2 function parameters ggplot_na_level2(tsAirgap, color_after = "green")
    # Example 5: Visualize before/after NA in tsAirgap - different theme # Plot adjustments via ggplot_na_level2 function parameters ggplot_na_level2(tsAirgap, theme = ggplot2::theme_classic())
    # Example 6: Visualize before/after NA in tsAirgap - title, subtitle in center # Plot adjustments via ggplot2 syntax ggplot_na_level2(tsAirgap) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
    # Example 7: Visualize before/after NA in tsAirgap - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_level2(tsAirgap, subtitle = NULL) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    # Example 8: Visualize before/after NA in tsAirgap - y-axis texts with angle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_level2(tsAirgap, color_source = "grey") + ggplot2::theme(axis.text.y = ggplot2::element_text(angle = 60, hjust = 1))

    Site built with pkgdown 1.6.1.

    ================================================ FILE: docs/reference/ggplot_na_pattern.html ================================================ Visualize Patterns in NA occurrences — ggplot_na_pattern • imputeTS

    Visualize patterns in NA occurrences (e.g. weekly patterns).

    ggplot_na_pattern(
      x,
      limit = 10,
      include_total = TRUE,
      ranked_by = "occurrence",
      color_occurrence = "indianred",
      color_total = "steelblue",
      title = "Occurrence of gap sizes",
      subtitle = "Gap sizes (NAs in a row) ordered by most common",
      xlab = NULL,
      ylab = "Number occurrence",
      legend = TRUE,
      orientation = "horizontal",
      label_occurrence = "Number occurrence gapsize",
      label_total = "Resulting NAs for gapsize",
      theme = ggplot2::theme_linedraw()
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.

    limit

    Specifies how many of the most common gap sizes are shown in the plot. Default is 10. So only the 10 most often occurring gap sizes will be shown. If more or all present gap sizes should be displayed, the limit needs to be increased. Since this might add a lot of additional data, having parameter orientation set to 'horizontal' avoids overlaps in the axis labels.

    include_total

    When set to TRUE the total NA count for a gapsize is included in the plot (total = number occurrence x gap size). E.g. if a gapsize of 3 occurs 10 times, this means this gap size makes up for 30 NAs in total. This can be a good indicator of the overall impact of a gapsize.

    ranked_by

    Should the results be sorted according to the number of occurrence or total resulting NAs for a gapsize. Total resulting NAs are calculated by (total = number occurrence x gap size).

    • "occurrence" - Sorting by 'number of occurrence' of a gap size

    • "total" - Sorting by 'total resulting NAs' of a gap size

    The default setting is "occurrence".

    color_occurrence

    Defines the Color for the Bars of 'number of occurrence'.

    color_total

    Defines the color for the bars of 'total resulting NAs'.

    title

    Title of the Plot.

    subtitle

    Subtitle of the Plot.

    xlab

    Label for x-Axis.

    ylab

    Label for y-Axis.

    legend

    If TRUE a legend is added at the bottom.

    orientation

    Can be either 'vertical' or 'horizontal'. Defines if the bars are plotted vertically or horizontally. For large amounts of different gap sizes horizontal illustration is favorable (also see parameter limit).

    label_occurrence

    Defines the label assigned to 'number of occurrence' in the legend.

    label_total

    Defines the label assigned to 'total resulting NAs' in the legend.

    theme

    Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (theme_linedraw)

    Value

    The output is a ggplot2 object that can be further adjusted by using the ggplot syntax

    Details

    This plotting function can be used to visualize the length of the NA gaps (NAs in a row) in a time series. It shows a ranking of which gap sizes occur most often. This ranking can be ordered by the number of occurrences of the gap sizes or by total resulting NAs for this gap size (occurrence * gap length). An NA-gap of 3 occurring 10 times means 30 total resulting NAs.

    A resulting plot can for example be described like this: a 2 NA-gap (2 NAs in a row) occurred 27 times, a 9 NA-gap (9 NAs in a row) occurred 11 times, a 27 NA-gap (27 NAs in a row) occurred 1 time, ...

    The only really needed parameter for this function is x (the univariate time series with NAs that shall be visualized). All other parameters are solely for altering the appearance of the plot.

    As long as the input is univariate and numeric the function also takes data.frame, tibble, tsibble, zoo, xts as an input.

    The plot can be adjusted to your needs via the function parameters. Additionally for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.

    See also

    Author

    Steffen Moritz, Sebastian Gatscha

    Examples

    # Example 1: Visualize the top gap sizes in tsNH4 (top 10 by default) ggplot_na_gapsize(tsNH4)
    # Example 2: Visualize the top gap sizes in tsAirgap - vertical bars ggplot_na_gapsize(tsAirgap, orientation = "vertical")
    # Example 3: Same as example 1, just written with pipe operator tsNH4 %>% ggplot_na_gapsize()
    # Example 4: Visualize the top 20 gap sizes in tsNH4 ggplot_na_gapsize(tsNH4, limit = 20)
    # Example 5: Visualize top gap sizes in tsNH4 without showing total NAs ggplot_na_gapsize(tsNH4, limit = 20, include_total = FALSE)
    # Example 6: Visualize top gap sizes in tsNH4 but ordered by total NAs # (total = occurrence * gap length) ggplot_na_gapsize(tsNH4, limit = 20, ranked_by = "total")
    # Example 7: Visualize top gap sizes in tsNH4 - different theme # Plot adjustments via ggplot_na_gapsize function parameters ggplot_na_gapsize(tsNH4, theme = ggplot2::theme_classic())
    # Example 8: Visualize top gap sizes in tsNH4 - title, subtitle in center # Plot adjustments via ggplot2 syntax ggplot_na_gapsize(tsNH4) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5))
    # Example 9: Visualize top gap sizes in tsNH4 - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_gapsize(tsNH4, subtitle = NULL) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
    # Example 10: Top gap sizes in tsNH4 - legend on the right and color change # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_gapsize(tsNH4, color_total = "grey") + ggplot2::theme(legend.position = "right")

    Site built with pkgdown 1.6.1.

    ================================================ FILE: docs/reference/imputeTS-package.html ================================================ imputeTS: Time Series Missing Value Imputation — imputeTS-package • imputeTS

    Imputation (replacement) of missing values in univariate time series. Offers several imputation functions and missing data plots. Available imputation algorithms include: 'Mean', 'LOCF', 'Interpolation', 'Moving Average', 'Seasonal Decomposition', 'Kalman Smoothing on Structural Time Series models', 'Kalman Smoothing on ARIMA models'. Published in Moritz and Bartz-Beielstein (2017) doi:10.32614/RJ-2017-009 .

    The imputeTS package is a collection of algorithms and tools for univariate time series imputation.

    Details

    The imputeTS package specializes on (univariate) time series imputation. It offers several different imputation algorithm implementations. Beyond the imputation algorithms the package also provides plotting and printing functions of missing data statistics.

    The package is easy to use:

    • To impute (fill all missing values) in a time series x, run:
      na_interpolation(x)

    • To plot missing data statistics for a time series x, run:
      ggplot_na_distribution(x)

    • To print missing data statistics for a time series x, run:
      statsNA(x)

    Every other imputation function (starting with na_'algorithm name') and plotting function (starting with ggplot_na_'plot name') works the same way as in this example.

    References

    Moritz, Steffen, and Thomas Bartz-Beielstein. "imputeTS: Time Series Missing Value Imputation in R." R Journal 9.1 (2017). doi:10.32614/RJ-2017-009.

    Author

    Maintainer: Steffen Moritz steffen.moritz10@gmail.com (ORCID) [copyright holder]

    Authors:

    Other contributors:

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/imputeTS.html ================================================ ================================================ FILE: docs/reference/index.html ================================================ Package index • imputeTS

    All functions

    ggplot_na_distribution()

    Line Plot to Visualize the Distribution of Missing Values

    ggplot_na_distribution2()

    Stacked Bar Plot to Visualize Missing Values per Time Interval

    ggplot_na_gapsize()

    Bar Plot to Visualize Occurrences of Different NA Gap Sizes

    ggplot_na_gapsize2()

    Bubble Plot to Visualize Total NA Count of NA gap sizes

    ggplot_na_imputations()

    Line Plot to Visualize Imputed Values

    na_interpolation()

    Missing Value Imputation by Interpolation

    na_kalman()

    Missing Value Imputation by Kalman Smoothing and State Space Models

    na_locf()

    Missing Value Imputation by Last Observation Carried Forward

    na_ma()

    Missing Value Imputation by Weighted Moving Average

    na_mean()

    Missing Value Imputation by Mean Value

    na_random()

    Missing Value Imputation by Random Sample

    na_remove()

    Remove Missing Values

    na_replace()

    Replace Missing Values by a Defined Value

    na_seadec()

    Seasonally Decomposed Missing Value Imputation

    na_seasplit()

    Seasonally Splitted Missing Value Imputation

    statsNA()

    Print Statistics about Missing Values

    tsAirgap

    Time series of monthly airline passengers (with NAs)

    tsAirgapComplete

    Time series of monthly airline passengers (complete)

    tsHeating

    Time series of a heating systems supply temperature (with NAs)

    tsHeatingComplete

    Time series of a heating systems supply temperature (complete)

    tsNH4

    Time series of NH4 concentration in a wastewater system (with NAs)

    tsNH4Complete

    Time series of NH4 concentration in a wastewater system (complete)

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.interpolation.html ================================================ Deprecated use na_interpolation instead. — na.interpolation • imputeTS

    na.interpolation is replaced by na_interpolation. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.interpolation(x, option = "linear", maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    option

    Algorithm to be used. Accepts the following input:

    • "linear" - for linear interpolation using approx (default choice)

    • "spline" - for spline interpolation using spline

    • "stine" - for Stineman interpolation using stinterp

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    ...

    Additional parameters to be passed through to approx or spline interpolation functions

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.kalman.html ================================================ Deprecated use na_kalman instead. — na.kalman • imputeTS

    na.kalman is replaced by na_kalman. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.kalman(x, model = "StructTS", smooth = TRUE, nit = -1, maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    model

    Model to be used. With this parameter the State Space Model (on which KalmanSmooth is performed) can be chosen. Accepts the following input:

    • "StructTS" - For using a structural model fitted by maximum likelihood (using StructTS) (default choice)

    • "auto.arima" - For using the state space representation of arima model (using auto.arima)

    For both auto.arima and StructTS additional parameters for model building can be given with the ... parameter

    Additionally, it is also possible to use a user-created state space model (see code Example 5). This state space model could, for example, be obtained from another R package for structural time series modeling. Furthermore, providing the state space representation of an ARIMA model from arima is also possible. It is important to note that user-created state space models must meet the requirements specified under KalmanLike. This means the user-supplied state space model has to be in the form of a list with at least the components T, Z, h, V, a, P, Pn (more details under KalmanLike).

    smooth

    if TRUE - KalmanSmooth is used for estimation, if FALSE - KalmanRun is used. Since KalmanRun is often considered extrapolation KalmanSmooth is usually the better choice for imputation.

    nit

    Parameter from Kalman Filtering (see KalmanLike). Usually no need to change from default.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    ...

    Additional parameters to be passed through to the functions that build the State Space Models (StructTS or auto.arima).

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.locf.html ================================================ Deprecated use na_locf instead. — na.locf • imputeTS

    na.locf is replaced by na_locf. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.locf(x, option = "locf", na.remaining = "rev", maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    option

    Algorithm to be used. Accepts the following input:

    • "locf" - for Last Observation Carried Forward (default choice)

    • "nocb" - for Next Observation Carried Backward

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.ma.html ================================================ Deprecated use na_ma instead. — na.ma • imputeTS

    na.ma is replaced by na_ma. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.ma(x, k = 4, weighting = "exponential", maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    k

    integer width of the moving average window. Expands to both sides of the center element e.g. k=2 means 4 observations (2 left, 2 right) are taken into account. If all observations in the current window are NA, the window size is automatically increased until there are at least 2 non-NA values present.

    weighting

    Weighting to be used. Accepts the following input:

    • "simple" - Simple Moving Average (SMA)

    • "linear" - Linear Weighted Moving Average (LWMA)

    • "exponential" - Exponential Weighted Moving Average (EWMA) (default choice)

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.mean.html ================================================ Deprecated use na_mean instead. — na.mean • imputeTS

    na.mean is replaced by na_mean. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.mean(x, option = "mean", maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    option

    Algorithm to be used. Accepts the following input:

    • "mean" - take the mean for imputation (default choice)

    • "median" - take the median for imputation

    • "mode" - take the mode for imputation

    • "harmonic" - take the harmonic mean

    • "geometric" - take the geometric mean

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.random.html ================================================ Deprecated use na_random instead. — na.random • imputeTS

    na.random is replaced by na_random. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.random(x, lower_bound = NULL, upper_bound = NULL, maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    lower_bound

    Lower bound for the random samples. If nothing or NULL is set min(x) will be used.

    upper_bound

    Upper bound for the random samples. If nothing or NULL is set max(x) will be used.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.remove.html ================================================ Deprecated use na_remove instead. — na.remove • imputeTS

    na.remove is replaced by na_remove. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.remove(x, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.replace.html ================================================ Deprecated use na_replace instead. — na.replace • imputeTS

    na.replace is replaced by na_replace. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.replace(x, fill = 0, maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    fill

    Value used to replace the missing values

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.seadec.html ================================================ Deprecated use na_seadec instead. — na.seadec • imputeTS

    na.seadec is replaced by na_seadec. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.seadec(
      x,
      algorithm = "interpolation",
      find_frequency = FALSE,
      maxgap = Inf,
      ...
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    algorithm

    Algorithm to be used after decomposition. Accepts the following input:

    • "interpolation" - Imputation by Interpolation (default choice)

    • "locf" - Imputation by Last Observation Carried Forward

    • "mean" - Imputation by Mean Value

    • "random" - Imputation by Random Sample

    • "kalman" - Imputation by Kalman Smoothing and State Space Models

    • "ma" - Imputation by Weighted Moving Average

    find_frequency

    If TRUE the algorithm will try to estimate the frequency of the time-series automatically.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    ...

    Additional parameters for these algorithms that can be passed through. Look at na_interpolation, na_locf, na_random, na_mean for parameter options.

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na.seasplit.html ================================================ Deprecated use na_seasplit instead. — na.seasplit • imputeTS

    na.seasplit is replaced by na_seasplit. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names).

    na.seasplit(
      x,
      algorithm = "interpolation",
      find_frequency = FALSE,
      maxgap = Inf,
      ...
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    algorithm

    Algorithm to be used after splits. Accepts the following input:

    • "interpolation" - Imputation by Interpolation (default choice)

    • "locf" - Imputation by Last Observation Carried Forward

    • "mean" - Imputation by Mean Value

    • "random" - Imputation by Random Sample

    • "kalman" - Imputation by Kalman Smoothing and State Space Models

    • "ma" - Imputation by Weighted Moving Average

    find_frequency

    If TRUE the algorithm will try to estimate the frequency of the time-series automatically.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    ...

    Additional parameters for these algorithms that can be passed through. Look at na_interpolation, na_locf, na_random, na_mean for parameter options.

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_interpolation.html ================================================ Missing Value Imputation by Interpolation — na_interpolation • imputeTS

    Uses either linear, spline or stineman interpolation to replace missing values.

    na_interpolation(x, option = "linear", maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    option

    Algorithm to be used. Accepts the following input:

    • "linear" - for linear interpolation using approx (default choice)

    • "spline" - for spline interpolation using spline

    • "stine" - for Stineman interpolation using stinterp

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    ...

    Additional parameters to be passed through to approx or spline interpolation functions

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Details

    Missing values get replaced by values of approx, spline or stinterp interpolation.

    The na_interpolation function also supports the use of additional parameters from the respective underlying interpolation functions. While usually not really needed, it is useful to know that this advanced use is in principle possible. These additional parameters are not specified explicitly in the na_interpolation function documentation. Take a look into the documentation of the stinterp, approx and spline functions to get an overview about these additional parameters.

    An example for such a parameter is the 'method' argument of spline, which can be used to further specify the type of spline to be used. Possible values are "fmm", "natural", "periodic", "monoH.FC" and "hyman" (as can be seen in the spline documentation). The respective function call using this additional parameter would look like this: na_interpolation(x, option ="spline", method ="natural")

    Like in this example, other additional detail parameters (gained from approx, spline, stinterp documentation) can be used by just including them in the na_interpolation function call. As already mentioned, these advanced possibilities for setting parameters are only helpful for specific use cases. For regular use the standard parameters provided directly in the na_interpolation documentation should be more than enough.

    References

    Johannesson, Tomas, et al. (2015). "Package stinepack".

    Author

    Steffen Moritz, Ron Hause

    Examples

    # Prerequisite: Create Time series with missing values
    x <- ts(c(2, 3, 4, 5, 6, NA, 7, 8))
    
    # Example 1: Perform linear interpolation
    na_interpolation(x)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.0 3.0 4.0 5.0 6.0 6.5 7.0 8.0
    
    # Example 2: Perform spline interpolation
    na_interpolation(x, option = "spline")
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.00000 3.00000 4.00000 5.00000 6.00000 6.53923 7.00000 8.00000
    
    # Example 3: Perform stine interpolation
    na_interpolation(x, option = "stine")
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.0 3.0 4.0 5.0 6.0 6.5 7.0 8.0
    
    # Example 4: Perform spline interpolation, with additional parameter pass through from spline()
    # Take a look at the 'Details' section of the na_interpolation documentation 
    # for more information about advanced parameter pass through options
    na_interpolation(x, option ="spline", method ="natural")
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.00000 3.00000 4.00000 5.00000 6.00000 6.50661 7.00000 8.00000
    
    # Example 5: Same as example 1, just written with pipe operator
    x %>% na_interpolation()
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.0 3.0 4.0 5.0 6.0 6.5 7.0 8.0
    
    # Example 6: Same as example 2, just written with pipe operator
    x %>% na_interpolation(option = "spline")
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.00000 3.00000 4.00000 5.00000 6.00000 6.53923 7.00000 8.00000
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_kalman.html ================================================ Missing Value Imputation by Kalman Smoothing and State Space Models — na_kalman • imputeTS

    Uses Kalman Smoothing on structural time series models (or on the state space representation of an arima model) for imputation.

    na_kalman(x, model = "StructTS", smooth = TRUE, nit = -1, maxgap = Inf, ...)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    model

    Model to be used. With this parameter the State Space Model (on which KalmanSmooth is performed) can be chosen. Accepts the following input:

    • "StructTS" - For using a structural model fitted by maximum likelihood (using StructTS) (default choice)

    • "auto.arima" - For using the state space representation of arima model (using auto.arima)

    For both auto.arima and StructTS additional parameters for model building can be given with the ... parameter

    Additionally, it is also possible to use a user-created state space model (see code Example 5). This state space model could, for example, be obtained from another R package for structural time series modeling. Furthermore, providing the state space representation of an ARIMA model from arima is also possible. It is important to note that user-created state space models must meet the requirements specified under KalmanLike. This means the user-supplied state space model has to be in the form of a list with at least the components T, Z, h, V, a, P, Pn (more details under KalmanLike).

    smooth

    if TRUE - KalmanSmooth is used for estimation, if FALSE - KalmanRun is used. Since KalmanRun is often considered extrapolation KalmanSmooth is usually the better choice for imputation.

    nit

    Parameter from Kalman Filtering (see KalmanLike). Usually no need to change from default.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NA runs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    ...

    Additional parameters to be passed through to the functions that build the State Space Models (StructTS or auto.arima).

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Details

    The KalmanSmoother used in this function is KalmanSmooth. It operates either on a Basic Structural Model obtained by StructTS or the state space representation of an ARIMA model obtained by auto.arima.

    For a detailed explanation of Kalman Filtering and State Space Models the following literature is a good starting point:

    • G. Welch, G. Bishop, An Introduction to the Kalman Filter. SIGGRAPH 2001 Course 8, 1995

    • Harvey, Andrew C. Forecasting, structural time series models and the Kalman filter. Cambridge university press, 1990

    • Grewal, Mohinder S. Kalman filtering. Springer Berlin Heidelberg, 2011

    References

    Hyndman RJ and Khandakar Y (2008). "Automatic time series forecasting: the forecast package for R". Journal of Statistical Software, 26(3).

    Author

    Steffen Moritz

    Examples

    # Example 1: Perform imputation with KalmanSmooth and StructTS model (default settings)
    na_kalman(tsAirgap)
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 123.5734 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 260.8078 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 312.6060 311.6160 311.0830 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 414.6160 465.0000 467.0000
    #> 1958 340.0000 318.0000 382.6912 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 452.5683 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 132.7833 119.0000 104.0000 118.0000
    #> 1950 157.1427 133.0000 130.8971 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 298.6147 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 390.9089
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 2: Perform imputation with KalmanRun and StructTS model (default model)
    na_kalman(tsAirgap, smooth = FALSE)
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 123.9264 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 232.6461 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 304.4782 302.5073 298.1029 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 385.8343 465.0000 467.0000
    #> 1958 340.0000 318.0000 392.3223 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 442.0008 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 141.5691 119.0000 104.0000 118.0000
    #> 1950 139.5378 133.0000 141.7595 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 327.9040 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 400.7839
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 3: Perform imputation with KalmanSmooth and StructTS model
    na_kalman(tsAirgap, model = "StructTS", smooth = TRUE)
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 123.5734 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 260.8078 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 312.6060 311.6160 311.0830 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 414.6160 465.0000 467.0000
    #> 1958 340.0000 318.0000 382.6912 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 452.5683 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 132.7833 119.0000 104.0000 118.0000
    #> 1950 157.1427 133.0000 130.8971 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 298.6147 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 390.9089
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 4: Perform imputation with KalmanSmooth and StructTS model with additional parameters
    na_kalman(tsAirgap, model = "StructTS", smooth = TRUE, type = "trend")
    #>         Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct
    #> 1949 112.00 118.00 132.00 129.00 132.00 135.00 148.00 148.00 133.50 119.00
    #> 1950 115.00 126.00 141.00 135.00 125.00 149.00 170.00 170.00 151.50 133.00
    #> 1951 145.00 150.00 178.00 163.00 172.00 178.00 199.00 199.00 184.00 162.00
    #> 1952 171.00 180.00 193.00 181.00 183.00 218.00 230.00 242.00 209.00 191.00
    #> 1953 196.00 196.00 236.00 235.00 229.00 243.00 264.00 272.00 237.00 211.00
    #> 1954 204.00 188.00 235.00 227.00 234.00 268.00 302.00 293.00 259.00 229.00
    #> 1955 242.00 233.00 267.00 269.00 270.00 315.00 364.00 347.00 312.00 274.00
    #> 1956 284.00 277.00 301.25 325.50 349.75 374.00 413.00 405.00 355.00 306.00
    #> 1957 315.00 301.00 356.00 348.00 355.00 410.00 465.00 467.00 404.00 347.00
    #> 1958 340.00 318.00 333.00 348.00 363.00 435.00 491.00 505.00 404.00 359.00
    #> 1959 360.00 342.00 406.00 396.00 420.00 472.00 548.00 559.00 463.00 407.00
    #> 1960 417.00 391.00 419.00 461.00 498.00 535.00 622.00 606.00 508.00 461.00
    #>         Nov    Dec
    #> 1949 104.00 118.00
    #> 1950 136.50 140.00
    #> 1951 146.00 166.00
    #> 1952 172.00 194.00
    #> 1953 180.00 201.00
    #> 1954 203.00 229.00
    #> 1955 237.00 278.00
    #> 1956 271.00 306.00
    #> 1957 341.50 336.00
    #> 1958 310.00 337.00
    #> 1959 362.00 389.50
    #> 1960 390.00 432.00
    
    # Example 5:  Perform imputation with KalmanSmooth and user created model
    usermodel <- arima(tsAirgap, order = c(1, 0, 1))$model
    na_kalman(tsAirgap, model = usermodel)
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 128.9291 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 270.8450 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 292.8668 316.5817 341.4769 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 409.7449 465.0000 467.0000
    #> 1958 340.0000 318.0000 329.7566 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 487.7672 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 136.5832 119.0000 104.0000 118.0000
    #> 1950 150.9669 133.0000 132.7634 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 332.3090 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 388.2858
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 6: Same as example 1, just written with pipe operator
    tsAirgap %>% na_kalman()
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 123.5734 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 260.8078 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 312.6060 311.6160 311.0830 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 414.6160 465.0000 467.0000
    #> 1958 340.0000 318.0000 382.6912 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 452.5683 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 132.7833 119.0000 104.0000 118.0000
    #> 1950 157.1427 133.0000 130.8971 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 298.6147 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 390.9089
    #> 1960 508.0000 461.0000 390.0000 432.0000
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_locf.html ================================================ Missing Value Imputation by Last Observation Carried Forward — na_locf • imputeTS

    Replaces each missing value with the most recent present value prior to it (Last Observation Carried Forward - LOCF). Optionally this can also be done starting from the back of the series (Next Observation Carried Backward - NOCB).

    na_locf(x, option = "locf", na_remaining = "rev", maxgap = Inf)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    option

    Algorithm to be used. Accepts the following input:

    • "locf" - for Last Observation Carried Forward (default choice)

    • "nocb" - for Next Observation Carried Backward

    na_remaining

    Method to be used for remaining NAs.

    • "rev" - to perform nocb / locf from the reverse direction (default choice)

    • "keep" - to return the series with NAs

    • "rm" - to remove remaining NAs

    • "mean" - to replace remaining NAs by overall mean

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Details

    General Functionality

    Replaces each missing value with the most recent present value prior to it (Last Observation Carried Forward - LOCF). This can also be done in reverse direction, starting from the end of the series (then called Next Observation Carried Backward - NOCB).

    Handling for NAs at the beginning of the series

    In case one or more successive observations directly at the start of the time series are NA, there exists no 'last value' yet, that can be carried forward. Thus, no LOCF imputation can be performed for these NAs. As soon as the first non-NA value appears, LOCF can be performed as expected. The same applies to NOCB, but from the opposite direction.

    While this problem might appear seldom and will only affect a very small amount of values at the beginning, it is something to consider. The na_remaining parameter helps to define, what should happen with these values at the start, that would remain NA after pure LOCF.

    Default setting is na_remaining = "rev", which performs nocb / locf from the other direction to fill these NAs. So a NA at the beginning will be filled with the next non-NA value appearing in the series.

    With na_remaining = "keep" NAs at the beginning (that can not be imputed with pure LOCF) are just left as remaining NAs.

    With na_remaining = "rm" NAs at the beginning of the series are completely removed. Thus, the time series is basically shortened.

    Also available is na_remaining = "mean", which uses the overall mean of the time series to replace these remaining NAs. (but beware, mean is usually not a good imputation choice - even if it only affects the values at the beginning)

    Author

    Steffen Moritz

    Examples

    # Prerequisite: Create Time series with missing values
    x <- ts(c(NA, 3, 4, 5, 6, NA, 7, 8))
    
    # Example 1: Perform LOCF
    na_locf(x)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 3 3 4 5 6 6 7 8
    
    # Example 2: Perform NOCB
    na_locf(x, option = "nocb")
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 3 3 4 5 6 7 7 8
    
    # Example 3: Perform LOCF and remove remaining NAs
    na_locf(x, na_remaining = "rm")
    #> [1] 3 4 5 6 6 7 8
    
    # Example 4: Same as example 1, just written with pipe operator
    x %>% na_locf()
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 3 3 4 5 6 6 7 8
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_ma.html ================================================ Missing Value Imputation by Weighted Moving Average — na_ma • imputeTS

    Missing value replacement by weighted moving average. Uses semi-adaptive window size to ensure all NAs are replaced.

    na_ma(x, k = 4, weighting = "exponential", maxgap = Inf)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    k

    Integer width of the moving average window. Expands to both sides of the center element e.g. k=2 means 4 observations (2 left, 2 right) are taken into account. If fewer than 2 non-NA values are present in the current window, the window size is automatically increased until there are at least 2 non-NA values present.

    weighting

    Weighting to be used. Accepts the following input:

    • "simple" - Simple Moving Average (SMA)

    • "linear" - Linear Weighted Moving Average (LWMA)

    • "exponential" - Exponential Weighted Moving Average (EWMA) (default choice)

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Details

    In this function missing values get replaced by moving average values. Moving Averages are also sometimes referred to as "moving mean", "rolling mean", "rolling average" or "running average".

    The mean in this implementation is taken from an equal number of observations on either side of a central value. This means for an NA value at position i of a time series, the observations i-2, i-1 and i+1, i+2 (assuming a window size of k=2) are used to calculate the mean.

    Since, in the case of long NA gaps, it can also occur that all values next to the central value are NA as well, the algorithm has a semi-adaptive window size. Whenever there are fewer than 2 non-NA values available in the complete window, the window size is incrementally increased until at least 2 non-NA values are present. In all other cases the algorithm sticks to the pre-set window size.

    There are options for using Simple Moving Average (SMA), Linear Weighted Moving Average (LWMA) and Exponential Weighted Moving Average (EWMA).

    SMA: all observations in the window are equally weighted for calculating the mean.

    LWMA: weights decrease in arithmetical progression. The observations directly next to a central value i, have weight 1/2, the observations one further away (i-2,i+2) have weight 1/3, the next (i-3,i+3) have weight 1/4, ...

    EWMA: uses weighting factors which decrease exponentially. The observations directly next to a central value i, have weight 1/2^1, the observations one further away (i-2,i+2) have weight 1/2^2, the next (i-3,i+3) have weight 1/2^3, ...

    Author

    Steffen Moritz

    Examples

    # Example 1: Perform imputation with simple moving average
    na_ma(tsAirgap, weighting = "simple")
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 131.7143 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 245.8750 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 310.5000 338.5000 351.3333 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 380.3750 465.0000 467.0000
    #> 1958 340.0000 318.0000 375.8571 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 494.8750 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 126.7143 119.0000 104.0000 118.0000
    #> 1950 147.4286 133.0000 155.1429 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 382.4286 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 434.8750
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 2: Perform imputation with exponential weighted moving average
    na_ma(tsAirgap)
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 133.6552 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 259.1000 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 294.7778 334.3571 369.2778 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 399.6000 465.0000 467.0000
    #> 1958 340.0000 318.0000 350.7931 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 501.7000 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 129.8276 119.0000 104.0000 118.0000
    #> 1950 152.0000 133.0000 144.3077 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 361.2069 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 406.9333
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 3: Perform imputation with exponential weighted moving average, window size 6
    na_ma(tsAirgap, k = 6)
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 133.1597 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 256.6349 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 298.0641 330.4516 362.3590 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 396.9677 465.0000 467.0000
    #> 1958 340.0000 318.0000 354.1311 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 499.0161 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 129.8607 119.0000 104.0000 118.0000
    #> 1950 151.7909 133.0000 144.8091 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 360.9500 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 410.7661
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 4: Same as example 1, just written with pipe operator
    tsAirgap %>% na_ma(weighting = "simple")
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 131.7143 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 245.8750 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 310.5000 338.5000 351.3333 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 380.3750 465.0000 467.0000
    #> 1958 340.0000 318.0000 375.8571 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 494.8750 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 126.7143 119.0000 104.0000 118.0000
    #> 1950 147.4286 133.0000 155.1429 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 382.4286 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 434.8750
    #> 1960 508.0000 461.0000 390.0000 432.0000
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_mean.html ================================================ Missing Value Imputation by Mean Value — na_mean • imputeTS

    Missing value replacement by mean values. Different means like median, mean, mode possible.

    na_mean(x, option = "mean", maxgap = Inf)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    option

    Algorithm to be used. Accepts the following input:

    • "mean" - take the mean for imputation (default choice)

    • "median" - take the median for imputation

    • "mode" - take the mode for imputation

    • "harmonic" - take the harmonic mean

    • "geometric" - take the geometric mean

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Details

    Missing values get replaced by overall mean values. The function calculates the mean, median, mode, harmonic or geometric mean over all the non-NA values and replaces all NAs with this value. Option 'mode' replaces NAs with the most frequent value in the time series. If two or more values occur equally frequently, the function imputes the lower value. Due to their calculation formula, geometric and harmonic mean are not well defined for negative values or zero values in the input series.

    In general, using the mean for imputation is mostly a suboptimal choice and should be handled with great caution.

    Author

    Steffen Moritz

    Examples

    # Prerequisite: Create Time series with missing values
    x <- ts(c(2, 3, 4, 5, 6, NA, 7, 8))
    
    # Example 1: Perform imputation with the overall mean
    na_mean(x)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2 3 4 5 6 5 7 8
    
    # Example 2: Perform imputation with overall median
    na_mean(x, option = "median")
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2 3 4 5 6 5 7 8
    
    # Example 3: Same as example 1, just written with pipe operator
    x %>% na_mean()
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2 3 4 5 6 5 7 8
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_random.html ================================================ Missing Value Imputation by Random Sample — na_random • imputeTS

    Replaces each missing value by drawing a random sample between two given bounds.

    na_random(x, lower_bound = NULL, upper_bound = NULL, maxgap = Inf)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    lower_bound

    Lower bound for the random samples. If nothing or NULL is set min(x) will be used.

    upper_bound

    Upper bound for the random samples. If nothing or NULL is set max(x) will be used.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Details

    Replaces each missing value by drawing a random sample between two given bounds. The default bounds are the minimum and the maximum value in the non-NAs from the time series. Function uses runif function to get the random values.

    Author

    Steffen Moritz

    Examples

    # Prerequisite: Create Time series with missing values
    x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8))
    
    # Example 1: Replace all NAs by random values that are between min and max of the input time series
    na_random(x)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.000000 3.000000 4.423466 5.000000 6.000000 3.310586 7.000000 8.000000
    
    # Example 2: Replace all NAs by random values between 1 and 10
    na_random(x, lower_bound = 1, upper_bound = 10)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.000000 3.000000 4.765253 5.000000 6.000000 7.019837 7.000000 8.000000
    
    # Example 3: Same as example 1, just written with pipe operator
    x %>% na_random()
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.000000 3.000000 5.045902 5.000000 6.000000 5.962156 7.000000 8.000000
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_remove.html ================================================ Remove Missing Values — na_remove • imputeTS

    Removes all missing values from a time series.

    na_remove(x)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be removed

    Value

    Vector (vector)

    Details

    Removes all missing values from an input time series. This shortens the time series by the number of missing values in the series. Should be handled with care, because this can affect the seasonality of the time series. Seasonal patterns might be destroyed. Independent from the input, this function only returns a vector (the time information of a resulting time series object wouldn't be correct any more).

    Author

    Steffen Moritz

    Examples

    # Prerequisite: Create Time series with missing values
    x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8))
    
    # Example 1: Remove all NAs
    na_remove(x)
    #> [1] 2 3 5 6 7 8
    
    # Example 2: Remove all NAs in tsAirgap
    na_remove(tsAirgap)
    #>   [1] 112 118 132 129 135 148 148 119 104 118 115 126 141 135 125 149 170 170
    #>  [19] 133 140 145 150 178 163 172 178 199 199 184 162 146 166 171 180 193 181
    #>  [37] 183 218 230 242 209 191 172 194 196 196 236 235 229 243 264 272 237 211
    #>  [55] 180 201 204 188 235 227 234 302 293 259 229 203 229 242 233 267 269 270
    #>  [73] 315 364 347 312 274 237 278 284 277 374 413 405 355 306 271 306 315 301
    #>  [91] 356 348 355 465 467 404 347 336 340 318 348 363 435 491 505 404 359 310
    #> [109] 337 360 342 406 396 420 472 548 559 463 407 362 417 391 419 461 535 622
    #> [127] 606 508 461 390 432
    
    # Example 3: Same as example 1, just written with pipe operator
    x %>% na_remove()
    #> [1] 2 3 5 6 7 8
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_replace.html ================================================ Replace Missing Values by a Defined Value — na_replace • imputeTS

    Replaces all missing values with a given value.

    na_replace(x, fill = 0, maxgap = Inf)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    fill

    Value used to replace the missing values

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Author

    Steffen Moritz

    Examples

    # Prerequisite: Create Time series with missing values
    x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8))
    
    # Example 1: Replace all NAs with 3.5
    na_replace(x, fill = 3.5)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.0 3.0 3.5 5.0 6.0 3.5 7.0 8.0
    
    # Example 2: Replace all NAs with 0
    na_replace(x, fill = 0)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2 3 0 5 6 0 7 8
    
    # Example 3: Same as example 1, just written with pipe operator
    x %>% na_replace(fill = 3.5)
    #> Time Series:
    #> Start = 1 
    #> End = 8 
    #> Frequency = 1 
    #> [1] 2.0 3.0 3.5 5.0 6.0 3.5 7.0 8.0
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_seadec.html ================================================ Seasonally Decomposed Missing Value Imputation — na_seadec • imputeTS

    Removes the seasonal component from the time series, performs imputation on the deseasonalized series and afterwards adds the seasonal component again.

    na_seadec(
      x,
      algorithm = "interpolation",
      find_frequency = FALSE,
      maxgap = Inf,
      ...
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    algorithm

    Algorithm to be used after decomposition. Accepts the following input:

    • "interpolation" - Imputation by Interpolation (default choice)

    • "locf" - Imputation by Last Observation Carried Forward

    • "mean" - Imputation by Mean Value

    • "random" - Imputation by Random Sample

    • "kalman" - Imputation by Kalman Smoothing and State Space Models

    • "ma" - Imputation by Weighted Moving Average

    find_frequency

    If TRUE the algorithm will try to estimate the frequency of the time-series automatically.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.

    ...

    Additional parameters for these algorithms that can be passed through. Look at na_interpolation, na_locf, na_random, na_mean, na_kalman, na_ma for parameter options.

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Details

    The algorithm first performs a Seasonal Decomposition of Time Series by Loess via stl. Decomposing the time series into seasonal, trend and irregular components. The seasonal component gets then removed (subtracted) from the original series. As a second step the selected imputation algorithm e.g. na_locf, na_ma, ... is applied on the deseasonalized series. Thus, the algorithm can work without being affected by seasonal patterns. After filling the NA gaps, the seasonal component is added to the deseasonalized series again.

    Implementation details: A paper about the STL Decomposition procedure is linked in the references. Since the function only works with complete data, the initial NA data is temporarily filled via linear interpolation in order to perform the decomposition. These temporarily imputed values are replaced with NAs again after obtaining the decomposition for the non-NA observations. STL decomposition is run with robust = TRUE and s.window = 11. Additionally, applying STL decomposition needs a preset frequency. This can be passed by the frequency set in the input ts object or by setting 'find_frequency=TRUE' in order to find an appropriate frequency for the time series. The find_frequency parameter internally uses findfrequency, which does a spectral analysis of the time series for identifying a suitable frequency. Using find_frequency will update the previously set frequency of a ts object to the newly found frequency. The default is 'find_frequency = FALSE', which gives a warning if no seasonality is set for the supplied time series object. If neither seasonality is set nor find_frequency is set to TRUE, the function goes on without decomposition and just applies the selected secondary algorithm to the original time series that still includes seasonality.

    References

    R. B. Cleveland, W. S. Cleveland, J.E. McRae, and I. Terpenning (1990) STL: A Seasonal-Trend Decomposition Procedure Based on Loess. Journal of Official Statistics, 6, 3–73.

    Author

    Steffen Moritz

    Examples

    # Example 1: Perform seasonal imputation using algorithm = "interpolation"
    na_seadec(tsAirgap, algorithm = "interpolation")
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 121.3941 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 274.9995 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 321.8776 321.4178 329.4329 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 424.1798 465.0000 467.0000
    #> 1958 340.0000 318.0000 357.2362 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 477.2415 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 131.6204 119.0000 104.0000 118.0000
    #> 1950 149.7682 133.0000 113.0095 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 305.0315 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 398.3917
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 2: Perform seasonal imputation using algorithm = "mean"
    na_seadec(tsAirgap, algorithm = "mean")
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 278.5342 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 310.9817 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 282.0424 276.8787 280.1899 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 323.7648 465.0000 467.0000
    #> 1958 340.0000 318.0000 281.3777 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 281.0843 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 289.2064 119.0000 104.0000 118.0000
    #> 1950 289.6443 133.0000 237.2661 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 215.2535 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 242.2860
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 3: Same as example 1, just written with pipe operator
    tsAirgap %>% na_seadec(algorithm = "interpolation")
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 121.3941 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 274.9995 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 321.8776 321.4178 329.4329 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 424.1798 465.0000 467.0000
    #> 1958 340.0000 318.0000 357.2362 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 477.2415 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 131.6204 119.0000 104.0000 118.0000
    #> 1950 149.7682 133.0000 113.0095 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 305.0315 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 398.3917
    #> 1960 508.0000 461.0000 390.0000 432.0000
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/na_seasplit.html ================================================ Seasonally Splitted Missing Value Imputation — na_seasplit • imputeTS

    Splits the time series into seasons and afterwards performs imputation separately for each of the resulting time series datasets (each containing the data for one specific season).

    na_seasplit(
      x,
      algorithm = "interpolation",
      find_frequency = FALSE,
      maxgap = Inf,
      ...
    )

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object in which missing values shall be replaced

    algorithm

    Algorithm to be used after splits. Accepts the following input:

    • "interpolation" - Imputation by Interpolation (default choice)

    • "locf" - Imputation by Last Observation Carried Forward

    • "mean" - Imputation by Mean Value

    • "random" - Imputation by Random Sample

    • "kalman" - Imputation by Kalman Smoothing and State Space Models

    • "ma" - Imputation by Weighted Moving Average

    find_frequency

    If TRUE the algorithm will try to estimate the frequency of the time-series automatically.

    maxgap

    Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, runs of consecutive NAs that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA separately afterwards.

    ...

    Additional parameters for these algorithms that can be passed through. Look at na_interpolation, na_locf, na_random, na_mean, na_kalman, na_ma for parameter options.

    Value

    Vector (vector) or Time Series (ts) object (dependent on given input at parameter x)

    Author

    Steffen Moritz

    Examples

    # Example 1: Perform seasonal splitted imputation using algorithm = "interpolation"
    na_seasplit(tsAirgap, algorithm = "interpolation")
    #>        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
    #> 1949 112.0 118.0 132.0 129.0 125.0 135.0 148.0 148.0 184.0 119.0 104.0 118.0
    #> 1950 115.0 126.0 141.0 135.0 125.0 149.0 170.0 170.0 184.0 133.0 125.0 140.0
    #> 1951 145.0 150.0 178.0 163.0 172.0 178.0 199.0 199.0 184.0 162.0 146.0 166.0
    #> 1952 171.0 180.0 193.0 181.0 183.0 218.0 230.0 242.0 209.0 191.0 172.0 194.0
    #> 1953 196.0 196.0 236.0 235.0 229.0 243.0 264.0 272.0 237.0 211.0 180.0 201.0
    #> 1954 204.0 188.0 235.0 227.0 234.0 279.0 302.0 293.0 259.0 229.0 203.0 229.0
    #> 1955 242.0 233.0 267.0 269.0 270.0 315.0 364.0 347.0 312.0 274.0 237.0 278.0
    #> 1956 284.0 277.0 311.5 308.5 312.5 374.0 413.0 405.0 355.0 306.0 271.0 306.0
    #> 1957 315.0 301.0 356.0 348.0 355.0 404.5 465.0 467.0 404.0 347.0 290.5 336.0
    #> 1958 340.0 318.0 381.0 348.0 363.0 435.0 491.0 505.0 404.0 359.0 310.0 337.0
    #> 1959 360.0 342.0 406.0 396.0 420.0 472.0 548.0 559.0 463.0 407.0 362.0 384.5
    #> 1960 417.0 391.0 419.0 461.0 420.0 535.0 622.0 606.0 508.0 461.0 390.0 432.0
    
    # Example 2: Perform seasonal splitted imputation using algorithm = "mean"
    na_seasplit(tsAirgap, algorithm = "mean")
    #>           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
    #> 1949 112.0000 118.0000 132.0000 129.0000 261.2222 135.0000 148.0000 148.0000
    #> 1950 115.0000 126.0000 141.0000 135.0000 125.0000 149.0000 170.0000 170.0000
    #> 1951 145.0000 150.0000 178.0000 163.0000 172.0000 178.0000 199.0000 199.0000
    #> 1952 171.0000 180.0000 193.0000 181.0000 183.0000 218.0000 230.0000 242.0000
    #> 1953 196.0000 196.0000 236.0000 235.0000 229.0000 243.0000 264.0000 272.0000
    #> 1954 204.0000 188.0000 235.0000 227.0000 234.0000 305.4000 302.0000 293.0000
    #> 1955 242.0000 233.0000 267.0000 269.0000 270.0000 315.0000 364.0000 347.0000
    #> 1956 284.0000 277.0000 256.3000 262.9091 261.2222 374.0000 413.0000 405.0000
    #> 1957 315.0000 301.0000 356.0000 348.0000 355.0000 305.4000 465.0000 467.0000
    #> 1958 340.0000 318.0000 256.3000 348.0000 363.0000 435.0000 491.0000 505.0000
    #> 1959 360.0000 342.0000 406.0000 396.0000 420.0000 472.0000 548.0000 559.0000
    #> 1960 417.0000 391.0000 419.0000 461.0000 261.2222 535.0000 622.0000 606.0000
    #>           Sep      Oct      Nov      Dec
    #> 1949 333.5000 119.0000 104.0000 118.0000
    #> 1950 333.5000 133.0000 237.5000 140.0000
    #> 1951 184.0000 162.0000 146.0000 166.0000
    #> 1952 209.0000 191.0000 172.0000 194.0000
    #> 1953 237.0000 211.0000 180.0000 201.0000
    #> 1954 259.0000 229.0000 203.0000 229.0000
    #> 1955 312.0000 274.0000 237.0000 278.0000
    #> 1956 355.0000 306.0000 271.0000 306.0000
    #> 1957 404.0000 347.0000 237.5000 336.0000
    #> 1958 404.0000 359.0000 310.0000 337.0000
    #> 1959 463.0000 407.0000 362.0000 248.8182
    #> 1960 508.0000 461.0000 390.0000 432.0000
    
    # Example 3: Same as example 1, just written with pipe operator
    tsAirgap %>% na_seasplit(algorithm = "interpolation")
    #>        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
    #> 1949 112.0 118.0 132.0 129.0 125.0 135.0 148.0 148.0 184.0 119.0 104.0 118.0
    #> 1950 115.0 126.0 141.0 135.0 125.0 149.0 170.0 170.0 184.0 133.0 125.0 140.0
    #> 1951 145.0 150.0 178.0 163.0 172.0 178.0 199.0 199.0 184.0 162.0 146.0 166.0
    #> 1952 171.0 180.0 193.0 181.0 183.0 218.0 230.0 242.0 209.0 191.0 172.0 194.0
    #> 1953 196.0 196.0 236.0 235.0 229.0 243.0 264.0 272.0 237.0 211.0 180.0 201.0
    #> 1954 204.0 188.0 235.0 227.0 234.0 279.0 302.0 293.0 259.0 229.0 203.0 229.0
    #> 1955 242.0 233.0 267.0 269.0 270.0 315.0 364.0 347.0 312.0 274.0 237.0 278.0
    #> 1956 284.0 277.0 311.5 308.5 312.5 374.0 413.0 405.0 355.0 306.0 271.0 306.0
    #> 1957 315.0 301.0 356.0 348.0 355.0 404.5 465.0 467.0 404.0 347.0 290.5 336.0
    #> 1958 340.0 318.0 381.0 348.0 363.0 435.0 491.0 505.0 404.0 359.0 310.0 337.0
    #> 1959 360.0 342.0 406.0 396.0 420.0 472.0 548.0 559.0 463.0 407.0 362.0 384.5
    #> 1960 417.0 391.0 419.0 461.0 420.0 535.0 622.0 606.0 508.0 461.0 390.0 432.0
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/plotNA.distribution.html ================================================ Discontinued - Use ggplot_na_distribution instead. — plotNA.distribution • imputeTS

    plotNA.distribution was replaced by ggplot_na_distribution. The new plotting function provides an improved version of the old plot, e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions.

    plotNA.distribution(x, ...)

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/plotNA.distributionBar.html ================================================ Discontinued - Use ggplot_na_distribution2 instead. — plotNA.distributionBar • imputeTS

    plotNA.distributionBar was replaced by ggplot_na_distribution2. The new plotting function provides an improved version of the old plot, e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions.

    plotNA.distributionBar(x, ...)

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/plotNA.gapsize.html ================================================ Discontinued - Use ggplot_na_gapsize instead. — plotNA.gapsize • imputeTS

    plotNA.gapsize was replaced by ggplot_na_gapsize. The new plotting function provides an improved version of the old plot, e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions.

    plotNA.gapsize(x, ...)

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/plotNA.imputations.html ================================================ Discontinued - Use ggplot_na_imputations instead. — plotNA.imputations • imputeTS

    plotNA.imputations was replaced by ggplot_na_imputations. The new plotting function provides an improved version of the old plot, e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions.

    plotNA.imputations(x, ...)

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/reexports.html ================================================ Objects exported from other packages — reexports • imputeTS

    These objects are imported from other packages. Follow the links below to see their documentation.

    magrittr

    %>%

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/statsNA.html ================================================ Print Statistics about Missing Values — statsNA • imputeTS

    Print summary stats about the distribution of missing values in a univariate time series.

    statsNA(x, bins = 4, print_only = TRUE)

    Arguments

    x

    Numeric Vector (vector) or Time Series (ts) object containing NAs

    bins

    Split number for bin stats. Number of bins the time series gets divided into. For each bin, information about the amount/percentage of missing values is printed. Default value is 4, which means that stats about the 1st, 2nd, 3rd, and 4th quarter of the time series are shown.

    print_only

    Choose whether the function prints or returns. For print_only = TRUE the function has no return value and just prints out missing value stats. If print_only is changed to FALSE, nothing is printed and the function returns a list. Printing gives a little bit more information, since the returned list does not include "Stats for Bins" and "Overview NA series".

    Value

    A list containing the stats. Beware: Function gives only a return value if print_only = FALSE.

    Details

    Prints the following information about the missing values in the time series:

    • "Length of time series" - Number of observations in the time series (including NAs)

    • "Number of Missing Values" - Number of missing values in the time series

    • "Percentage of Missing Values" - Percentage of missing values in the time series

    • "Number of Gaps" - Number of NA gaps (consisting of one or more consecutive NAs) in the time series

    • "Average Gap Size" - Average size of consecutive NAs for the NA gaps in the time series

    • "Stats for Bins" - Number/percentage of missing values for the split into bins

    • "Longest NA gap" - Longest series of consecutive missing values (NAs in a row) in the time series

    • "Most frequent gap size" - Most frequent occurring series of missing values in the time series

    • "Gap size accounting for most NAs" - The series of consecutive missing values that accounts for most missing values overall in the time series

    • "Overview NA series" - Overview about how often each series of consecutive missing values occurs. Series occurring 0 times are skipped

    Furthermore, it is important to note that you can choose whether the function returns a list or only prints the information (see the description of parameter "print_only").

    Author

    Steffen Moritz

    Examples

    # Example 1: Print stats about the missing data in tsNH4
    statsNA(tsNH4)
    #> [1] "Length of time series:"
    #> [1] 4552
    #> [1] "-------------------------"
    #> [1] "Number of Missing Values:"
    #> [1] 883
    #> [1] "-------------------------"
    #> [1] "Percentage of Missing Values:"
    #> [1] "19.4%"
    #> [1] "-------------------------"
    #> [1] "Number of Gaps:"
    #> [1] 155
    #> [1] "-------------------------"
    #> [1] "Average Gap Size:"
    #> [1] 5.696774
    #> [1] "-------------------------"
    #> [1] "Stats for Bins"
    #> [1] "  Bin 1 (1138 values from 1 to 1138) :      233 NAs (20.5%)"
    #> [1] "  Bin 2 (1138 values from 1139 to 2276) :      433 NAs (38%)"
    #> [1] "  Bin 3 (1138 values from 2277 to 3414) :      135 NAs (11.9%)"
    #> [1] "  Bin 4 (1138 values from 3415 to 4552) :      82 NAs (7.21%)"
    #> [1] "-------------------------"
    #> [1] "Longest NA gap (series of consecutive NAs)"
    #> [1] "157 in a row"
    #> [1] "-------------------------"
    #> [1] "Most frequent gap size (series of consecutive NA series)"
    #> [1] "1 NA in a row (occurring 68 times)"
    #> [1] "-------------------------"
    #> [1] "Gap size accounting for most NAs"
    #> [1] "157 NA in a row (occurring 1 times, making up for overall 157 NAs)"
    #> [1] "-------------------------"
    #> [1] "Overview NA series"
    #> [1] "  1 NA in a row: 68 times"
    #> [1] "  2 NA in a row: 26 times"
    #> [1] "  3 NA in a row: 16 times"
    #> [1] "  4 NA in a row: 10 times"
    #> [1] "  5 NA in a row: 8 times"
    #> [1] "  6 NA in a row: 4 times"
    #> [1] "  7 NA in a row: 2 times"
    #> [1] "  8 NA in a row: 3 times"
    #> [1] "  9 NA in a row: 2 times"
    #> [1] "  10 NA in a row: 1 times"
    #> [1] "  11 NA in a row: 1 times"
    #> [1] "  12 NA in a row: 2 times"
    #> [1] "  14 NA in a row: 1 times"
    #> [1] "  16 NA in a row: 1 times"
    #> [1] "  17 NA in a row: 1 times"
    #> [1] "  21 NA in a row: 1 times"
    #> [1] "  25 NA in a row: 1 times"
    #> [1] "  26 NA in a row: 1 times"
    #> [1] "  27 NA in a row: 1 times"
    #> [1] "  32 NA in a row: 1 times"
    #> [1] "  42 NA in a row: 2 times"
    #> [1] "  91 NA in a row: 1 times"
    #> [1] "  157 NA in a row: 1 times"
    
    # Example 2: Return list with stats about the missing data in tsAirgap
    statsNA(tsAirgap, print_only = FALSE)
    #> $length_series
    #> [1] 144
    #> 
    #> $number_NAs
    #> [1] 13
    #> 
    #> $number_na_gaps
    #> [1] 11
    #> 
    #> $average_size_na_gaps
    #> [1] 1.181818
    #> 
    #> $percentage_NAs
    #> [1] "9.03%"
    #> 
    #> $longest_na_gap
    #> [1] 3
    #> 
    #> $most_frequent_na_gap
    #> [1] 1
    #> 
    #> $most_weighty_na_gap
    #> [1] 1
    #> 
    #> $df_distribution_na_gaps
    #>   [1] 10  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    #>  [26]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    #>  [51]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    #>  [76]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    #> [101]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    #> [126]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    #> 
    
    # Example 3: Same as example 1, just written with pipe operator
    tsNH4 %>% statsNA()
    #> [1] "Length of time series:"
    #> [1] 4552
    #> [1] "-------------------------"
    #> [1] "Number of Missing Values:"
    #> [1] 883
    #> [1] "-------------------------"
    #> [1] "Percentage of Missing Values:"
    #> [1] "19.4%"
    #> [1] "-------------------------"
    #> [1] "Number of Gaps:"
    #> [1] 155
    #> [1] "-------------------------"
    #> [1] "Average Gap Size:"
    #> [1] 5.696774
    #> [1] "-------------------------"
    #> [1] "Stats for Bins"
    #> [1] "  Bin 1 (1138 values from 1 to 1138) :      233 NAs (20.5%)"
    #> [1] "  Bin 2 (1138 values from 1139 to 2276) :      433 NAs (38%)"
    #> [1] "  Bin 3 (1138 values from 2277 to 3414) :      135 NAs (11.9%)"
    #> [1] "  Bin 4 (1138 values from 3415 to 4552) :      82 NAs (7.21%)"
    #> [1] "-------------------------"
    #> [1] "Longest NA gap (series of consecutive NAs)"
    #> [1] "157 in a row"
    #> [1] "-------------------------"
    #> [1] "Most frequent gap size (series of consecutive NA series)"
    #> [1] "1 NA in a row (occurring 68 times)"
    #> [1] "-------------------------"
    #> [1] "Gap size accounting for most NAs"
    #> [1] "157 NA in a row (occurring 1 times, making up for overall 157 NAs)"
    #> [1] "-------------------------"
    #> [1] "Overview NA series"
    #> [1] "  1 NA in a row: 68 times"
    #> [1] "  2 NA in a row: 26 times"
    #> [1] "  3 NA in a row: 16 times"
    #> [1] "  4 NA in a row: 10 times"
    #> [1] "  5 NA in a row: 8 times"
    #> [1] "  6 NA in a row: 4 times"
    #> [1] "  7 NA in a row: 2 times"
    #> [1] "  8 NA in a row: 3 times"
    #> [1] "  9 NA in a row: 2 times"
    #> [1] "  10 NA in a row: 1 times"
    #> [1] "  11 NA in a row: 1 times"
    #> [1] "  12 NA in a row: 2 times"
    #> [1] "  14 NA in a row: 1 times"
    #> [1] "  16 NA in a row: 1 times"
    #> [1] "  17 NA in a row: 1 times"
    #> [1] "  21 NA in a row: 1 times"
    #> [1] "  25 NA in a row: 1 times"
    #> [1] "  26 NA in a row: 1 times"
    #> [1] "  27 NA in a row: 1 times"
    #> [1] "  32 NA in a row: 1 times"
    #> [1] "  42 NA in a row: 2 times"
    #> [1] "  91 NA in a row: 1 times"
    #> [1] "  157 NA in a row: 1 times"
    

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/tsAirgap.html ================================================ Time series of monthly airline passengers (with NAs) — tsAirgap • imputeTS

    Monthly totals of international airline passengers, 1949 to 1960. This time series contains missing values. The package also includes the tsAirgapComplete time series, which provides the true values for the missing values.

    tsAirgap

    Format

    Time Series (ts) with 144 rows including 13 NAs.

    Source

    Box, G. E. P., Jenkins, G. M., Reinsel, G. C. and Ljung, G. M. (2015). Time series analysis: forecasting and control. Fifth Edition. John Wiley and Sons.

    Details

    The dataset originates from Box and Jenkins (see citation) and is a commonly used example in time series analysis literature.

    Its characteristics (strong trend, strong seasonal behavior) also make it a great example for time series imputation. Thus the version with inserted NA gaps was created under the name tsAirgap.

    In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied.

    These are the two time series:

    • tsAirgap - The time series with NAs.

    • tsAirgapComplete - Time series without NAs.

    See also

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/tsAirgapComplete.html ================================================ Time series of monthly airline passengers (complete) — tsAirgapComplete • imputeTS

    Monthly totals of international airline passengers, 1949 to 1960. This time series provides the truth for the missing values of the tsAirgap time series. Thus it is identical to the tsAirgap time series except that no value is missing.

    tsAirgapComplete

    Format

    Time Series (ts) with 144 rows.

    Source

    Box, G. E. P., Jenkins, G. M., Reinsel, G. C. and Ljung, G. M. (2015). Time series analysis: forecasting and control. Fifth Edition. John Wiley and Sons.

    Details

    The dataset originates from Box and Jenkins (see citation) and is a commonly used example in time series analysis literature.

    Its characteristics (strong trend, strong seasonal behavior) also make it a great example for time series imputation. Thus the version with inserted NA gaps was created under the name tsAirgap.

    In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied.

    These are the two time series:

    • tsAirgap - The time series with NAs.

    • tsAirgapComplete - Time series without NAs.

    See also

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/tsHeating.html ================================================ Time series of a heating systems supply temperature (with NAs) — tsHeating • imputeTS

    Time series of a heating system's supply temperature. Measured from 18.11.2013 - 05:12:00 to 13.01.2015 - 15:08:00 in 1 minute steps. This time series contains missing values. The package also includes the tsHeatingComplete time series, which provides the true values for the missing values.

    tsHeating

    Format

    Time Series (ts) with 606837 rows including 57391 NAs.

    Source

    Moritz, Steffen, Friese, Martina, Fischbach, Andreas, Schlitt, Christopher, and Bartz-Beielstein, Thomas. (2015, May 1). GECCO Industrial Challenge 2015 Dataset: A heating system dataset for the 'Recovering missing information in heating system operating data' competition at the Genetic and Evolutionary Computation Conference 2015, Madrid, Spain. http://doi.org/10.5281/zenodo.3884899

    Details

    The time series originates from the GECCO Industrial Challenge 2015. This challenge was about "Recovering missing information in heating system operating data". The goal was to impute missing values in heating system sensor data as accurately as possible. (doi:10.5281/zenodo.3884899)

    In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series.

    These are the two time series:

    • tsHeating - The time series with NAs.

    • tsHeatingComplete - Time series without NAs.

    See also

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/tsHeatingComplete.html ================================================ Time series of a heating systems supply temperature (complete) — tsHeatingComplete • imputeTS

    Time series of a heating system's supply temperature. Measured from 18.11.2013 - 05:12:00 to 13.01.2015 - 15:08:00 in 1 minute steps. This time series provides the truth for the missing values of the tsHeating time series. Thus it is identical to the tsHeating time series except that no value is missing.

    tsHeatingComplete

    Format

    Time Series (ts) with 606837 rows.

    Source

    Moritz, Steffen, Friese, Martina, Fischbach, Andreas, Schlitt, Christopher, and Bartz-Beielstein, Thomas. (2015, May 1). GECCO Industrial Challenge 2015 Dataset: A heating system dataset for the 'Recovering missing information in heating system operating data' competition at the Genetic and Evolutionary Computation Conference 2015, Madrid, Spain. http://doi.org/10.5281/zenodo.3884899

    Details

    The time series originates from the GECCO Industrial Challenge 2015. This challenge was about "Recovering missing information in heating system operating data". The goal was to impute missing values in heating system sensor data as accurately as possible. (doi:10.5281/zenodo.3884899)

    In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series.

    These are the two time series:

    • tsHeating - The time series with NAs.

    • tsHeatingComplete - Time series without NAs.

    See also

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/tsNH4.html ================================================ Time series of NH4 concentration in a wastewater system (with NAs) — tsNH4 • imputeTS

    Time series of NH4 concentration in a wastewater system. Measured from 30.11.2010 - 16:10 to 01.01.2011 - 6:40 in 10 minute steps. This time series contains missing values. The package also includes the tsNH4Complete time series, which provides the true values for the missing values.

    tsNH4

    Format

    Time Series (ts) with 4552 rows including 883 NAs.

    Source

    Friese, Martina, Fischbach, Andreas, Flasch, Oliver, Mersmann, Olaf, Bartz-Beielstein, Thomas, and Walbeck, Klaus. (2014, July 16). GECCO Industrial Challenge 2014 Dataset: A water quality dataset for the 'Active protection against pollution of the surface water' competition at the Genetic and Evolutionary Computation Conference 2014, Vancouver, Canada. http://www.spotseven.de/gecco-challenge/gecco-challenge-2014

    Details

    The time series is derived from the dataset of the GECCO Industrial Challenge 2014.

    In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series.

    These are the two time series:

    • tsNH4 - The time series with NAs.

    • tsNH4Complete - Time series without NAs.

    See also

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/reference/tsNH4Complete.html ================================================ Time series of NH4 concentration in a wastewater system (complete) — tsNH4Complete • imputeTS

    Time series of NH4 concentration in a wastewater system. Measured from 30.11.2010 - 16:10 to 01.01.2011 - 6:40 in 10 minute steps. This time series provides the truth for the missing values of the tsNH4 time series. Thus it is identical to the tsNH4 time series except that no value is missing.

    tsNH4Complete

    Format

    Time Series (ts) with 4552 rows.

    Source

    Friese, Martina, Fischbach, Andreas, Flasch, Oliver, Mersmann, Olaf, Bartz-Beielstein, Thomas, and Walbeck, Klaus. (2014, July 16). GECCO Industrial Challenge 2014 Dataset: A water quality dataset for the 'Active protection against pollution of the surface water' competition at the Genetic and Evolutionary Computation Conference 2014, Vancouver, Canada. http://www.spotseven.de/gecco-challenge/gecco-challenge-2014

    Details

    The time series is derived from the dataset of the GECCO Industrial Challenge 2014.

    In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series.

    These are the two time series:

    • tsNH4 - The time series with NAs.

    • tsNH4Complete - Time series without NAs.

    See also

    Site built with pkgdown 2.1.3.

    ================================================ FILE: docs/sitemap.xml ================================================ https://SteffenMoritz.github.io/imputeTS/404.html https://SteffenMoritz.github.io/imputeTS/articles/gallery_visualizations.html https://SteffenMoritz.github.io/imputeTS/articles/index.html https://SteffenMoritz.github.io/imputeTS/authors.html https://SteffenMoritz.github.io/imputeTS/index.html https://SteffenMoritz.github.io/imputeTS/news/index.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_distribution.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_distribution2.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_gapsize.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_gapsize2.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_imputations.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_intervals.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_level.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_level2.html https://SteffenMoritz.github.io/imputeTS/reference/ggplot_na_pattern.html https://SteffenMoritz.github.io/imputeTS/reference/imputeTS-package.html https://SteffenMoritz.github.io/imputeTS/reference/index.html https://SteffenMoritz.github.io/imputeTS/reference/na.interpolation.html https://SteffenMoritz.github.io/imputeTS/reference/na.kalman.html https://SteffenMoritz.github.io/imputeTS/reference/na.locf.html https://SteffenMoritz.github.io/imputeTS/reference/na.ma.html https://SteffenMoritz.github.io/imputeTS/reference/na.mean.html https://SteffenMoritz.github.io/imputeTS/reference/na.random.html https://SteffenMoritz.github.io/imputeTS/reference/na.remove.html https://SteffenMoritz.github.io/imputeTS/reference/na.replace.html https://SteffenMoritz.github.io/imputeTS/reference/na.seadec.html https://SteffenMoritz.github.io/imputeTS/reference/na.seasplit.html 
https://SteffenMoritz.github.io/imputeTS/reference/na_interpolation.html https://SteffenMoritz.github.io/imputeTS/reference/na_kalman.html https://SteffenMoritz.github.io/imputeTS/reference/na_locf.html https://SteffenMoritz.github.io/imputeTS/reference/na_ma.html https://SteffenMoritz.github.io/imputeTS/reference/na_mean.html https://SteffenMoritz.github.io/imputeTS/reference/na_random.html https://SteffenMoritz.github.io/imputeTS/reference/na_remove.html https://SteffenMoritz.github.io/imputeTS/reference/na_replace.html https://SteffenMoritz.github.io/imputeTS/reference/na_seadec.html https://SteffenMoritz.github.io/imputeTS/reference/na_seasplit.html https://SteffenMoritz.github.io/imputeTS/reference/plotNA.distribution.html https://SteffenMoritz.github.io/imputeTS/reference/plotNA.distributionBar.html https://SteffenMoritz.github.io/imputeTS/reference/plotNA.gapsize.html https://SteffenMoritz.github.io/imputeTS/reference/plotNA.imputations.html https://SteffenMoritz.github.io/imputeTS/reference/reexports.html https://SteffenMoritz.github.io/imputeTS/reference/statsNA.html https://SteffenMoritz.github.io/imputeTS/reference/tsAirgap.html https://SteffenMoritz.github.io/imputeTS/reference/tsAirgapComplete.html https://SteffenMoritz.github.io/imputeTS/reference/tsHeating.html https://SteffenMoritz.github.io/imputeTS/reference/tsHeatingComplete.html https://SteffenMoritz.github.io/imputeTS/reference/tsNH4.html https://SteffenMoritz.github.io/imputeTS/reference/tsNH4Complete.html ================================================ FILE: imputeTS.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace 
================================================ FILE: inst/CITATION ================================================ year <- sub("-.*", "", meta$Date) vers <- paste("R package version", meta$Version) citHeader("To cite the imputeTS package, use:") bibentry(bibtype = "Article", title = "{imputeTS: Time Series Missing Value Imputation in R}", author = c(person("Steffen Moritz"),person("Thomas Bartz-Beielstein")), journal = "{The R Journal}", volume = 9, number = 1, pages = "207--218", year = 2017, doi = "10.32614/RJ-2017-009") ================================================ FILE: man/ggplot_na_distribution.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ggplot_na_distribution.R \name{ggplot_na_distribution} \alias{ggplot_na_distribution} \title{Line Plot to Visualize the Distribution of Missing Values} \usage{ ggplot_na_distribution( x, x_axis_labels = NULL, color_points = "steelblue", color_lines = "steelblue2", color_missing = "indianred", color_missing_border = "indianred", alpha_missing = 0.5, title = "Distribution of Missing Values", subtitle = "Time Series with highlighted missing regions", xlab = "Time", ylab = "Value", shape_points = 20, size_points = 2.5, theme = ggplot2::theme_linedraw() ) } \arguments{ \item{x}{Numeric Vector (\code{\link[base]{vector}}) or Time Series (\code{\link[stats]{ts}}) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.} \item{x_axis_labels}{For adding specific x-axis labels. Takes a vector of \code{\link[base]{Date}} or \code{\link[base]{POSIXct}} objects as an input (needs the same length as x).
The Default (NULL) uses the observation numbers as x-axis tick labels.} \item{color_points}{Color for the Symbols/Points.} \item{color_lines}{Color for the Lines.} \item{color_missing}{Color used for highlighting the time spans with NA values.} \item{color_missing_border}{Color used as border for time spans with NA values.} \item{alpha_missing}{Alpha (transparency) value used for color_missing.} \item{title}{Title of the Plot (NULL for deactivating title).} \item{subtitle}{Subtitle of the Plot (NULL for deactivating subtitle).} \item{xlab}{Label for x-Axis.} \item{ylab}{Label for y-Axis.} \item{shape_points}{Symbol to use for the Observations/Points. See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference.} \item{size_points}{Size of Symbols/Points.} \item{theme}{Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (\code{\link[ggplot2]{theme_linedraw}})} } \description{ Visualize the distribution of missing values within a time series. } \details{ This function visualizes the distribution of missing values within a time series. If a value is NA, the background is colored differently. This gives a good overview of where most missing values occur. The only required parameter for this function is x (the univariate time series that shall be visualized). All other parameters are solely for altering the appearance of the plot. As long as the input is univariate and numeric the function also takes data.frame, tibble, tsibble, zoo, xts as an input. The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made. For very long time series it might happen that the plot gets too crowded and overplotting issues occur.
In this case the \code{\link[imputeTS]{ggplot_na_distribution2}} plotting function can provide a more condensed overview. } \examples{ # Example 1: Visualize the missing values in x x <- stats::ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7)) ggplot_na_distribution(x) # Example 2: Visualize the missing values in tsAirgap time series ggplot_na_distribution(tsAirgap) # Example 3: Same as example 1, just written with pipe operator x <- ts(c(1:11, 4:9, NA, NA, NA, 11:15, 7:15, 15:6, NA, NA, 2:5, 3:7)) x \%>\% ggplot_na_distribution() # Example 4: Visualize NAs in tsAirgap - different color for points # Plot adjustments via ggplot_na_distribution function parameters ggplot_na_distribution(tsAirgap, color_points = "grey") # Example 5: Visualize NAs in tsAirgap - different theme # Plot adjustments via ggplot_na_distribution function parameters ggplot_na_distribution(tsAirgap, theme = ggplot2::theme_classic()) # Example 6: Visualize NAs in tsAirgap - title, subtitle in center # Plot adjustments via ggplot2 syntax ggplot_na_distribution(tsAirgap) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) # Example 7: Visualize NAs in tsAirgap - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_distribution(tsAirgap, subtitle = NULL) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) # Example 8: Visualize NAs in tsAirgap - x-axis texts with angle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_distribution(tsAirgap, color_points = "grey") + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1)) } \seealso{ \code{\link[imputeTS]{ggplot_na_distribution2}}, \code{\link[imputeTS]{ggplot_na_gapsize}}, \code{\link[imputeTS]{ggplot_na_gapsize2}}, \code{\link[imputeTS]{ggplot_na_imputations}} } \author{ Steffen Moritz, Sebastian Gatscha } 
================================================ FILE: man/ggplot_na_distribution2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ggplot_na_distribution2.R \name{ggplot_na_distribution2} \alias{ggplot_na_distribution2} \title{Stacked Bar Plot to Visualize Missing Values per Time Interval} \usage{ ggplot_na_distribution2( x, number_intervals = NULL, interval_size = NULL, measure = "percent", color_missing = "indianred2", color_existing = "steelblue", alpha_missing = 0.8, alpha_existing = 0.3, title = "Missing Values per Interval", subtitle = "Amount of NA and non-NA for successive intervals", xlab = "Time Lapse (Interval Size: XX)", ylab = NULL, color_border = "white", theme = ggplot2::theme_linedraw() ) } \arguments{ \item{x}{Numeric Vector (\code{\link[base]{vector}}) or Time Series (\code{\link[stats]{ts}}) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.} \item{number_intervals}{Defines the number of bins to be created. Default number of intervals (denoted by NULL) is calculated by \code{\link[grDevices]{nclass.Sturges}} using Sturges' formula. If the interval_size parameter is set to a value different to NULL this parameter is ignored.} \item{interval_size}{Defines how many observations should be in one bin/interval. The required number of overall bins is afterwards calculated automatically. If used this parameter overwrites the number_intervals parameter. For a very long time series be sure to make the interval_size not extremely small, otherwise because of overplotting issues nothing can be seen until you also increase the plot width.} \item{measure}{Whether the NA / non-NA ratio should be given as percent or absolute numbers. 
\itemize{ \item{"percent" - for percentages} \item{"count" - for absolute numbers of NAs} }} \item{color_missing}{Color for the amount of missing values.} \item{color_existing}{Color for the amount of existing values.} \item{alpha_missing}{Alpha (transparency) value for the missing values.} \item{alpha_existing}{Alpha (transparency) value for the existing values.} \item{title}{Title of the Plot (NULL for deactivating title).} \item{subtitle}{Subtitle of the Plot (NULL for deactivating subtitle).} \item{xlab}{Label for x-Axis. Automatically set to the current interval size, if no custom text is chosen.} \item{ylab}{Label for y-Axis. As default (NULL), the axis is automatically set to either 'Percent' or 'Count' dependent on the settings of parameter \code{measure}.} \item{color_border}{Color for the small borders between the intervals/bins. Default is 'white'.} \item{theme}{Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (\code{\link[ggplot2]{theme_linedraw}})} } \description{ Visualization of missing values in barplot form. Especially useful when looking at specific intervals and for time series with a lot of observations. } \details{ This function visualizes the distribution of missing values within a time series. In comparison to the \code{\link[imputeTS]{ggplot_na_distribution}} function this is not done by plotting each observation of the time series separately. Instead observations for time intervals are represented as intervals/bins of multiple values. For these intervals information about the amount of missing values is shown. This has the advantage that, even for large time series, a plot that is easy to survey can be created. The only required parameter for this function is x (the univariate time series that shall be visualized). All other parameters are solely for altering the appearance of the plot. As long as the input is univariate and numeric the function also takes data.frame, tibble, tsibble, zoo, xts as an input.
The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made. } \examples{ # Example 1: Visualize the missing values in tsNH4 time series as percentages ggplot_na_distribution2(tsNH4) # Example 2: Visualize the missing values in tsNH4 time series as counts ggplot_na_distribution2(tsNH4, measure = "count") # Example 3: Visualize the missing values in tsHeating time series ggplot_na_distribution2(tsHeating) # Example 4: Same as example 1, just written with pipe operator tsNH4 \%>\% ggplot_na_distribution2() # Example 5: Visualize NAs in tsNH4 - exactly 8 intervals ggplot_na_distribution2(tsNH4, number_intervals = 8) # Example 6: Visualize NAs in tsNH4 - 300 observations per interval ggplot_na_distribution2(tsNH4, interval_size = 300) # Example 7: Visualize NAs in tsAirgap - different color for NAs # Plot adjustments via ggplot_na_distribution2 function parameters ggplot_na_distribution2(tsAirgap, color_missing = "pink") # Example 8: Visualize NAs in tsNH4 - different theme # Plot adjustments via ggplot_na_distribution2 function parameters ggplot_na_distribution2(tsNH4, theme = ggplot2::theme_classic()) # Example 9: Visualize NAs in tsAirgap - title, subtitle in center # Plot adjustments via ggplot2 syntax ggplot_na_distribution2(tsAirgap) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggtext::element_markdown(hjust = 0.5)) # Example 10: Visualize NAs in tsAirgap - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_distribution2(tsAirgap, subtitle = NULL) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) # Example 11: Visualize NAs in tsAirgap - x-axis texts with angle # Plot adjustments via ggplot2 syntax and 
function parameters ggplot_na_distribution2(tsAirgap, color_missing = "grey") + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1)) } \seealso{ \code{\link[imputeTS]{ggplot_na_distribution}}, \code{\link[imputeTS]{ggplot_na_gapsize}}, \code{\link[imputeTS]{ggplot_na_gapsize2}}, \code{\link[imputeTS]{ggplot_na_imputations}} } \author{ Steffen Moritz, Sebastian Gatscha } ================================================ FILE: man/ggplot_na_gapsize.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ggplot_na_gapsize.R \name{ggplot_na_gapsize} \alias{ggplot_na_gapsize} \title{Bar Plot to Visualize Occurrences of Different NA Gap Sizes} \usage{ ggplot_na_gapsize( x, limit = 10, include_total = TRUE, ranked_by = "occurrence", color_occurrence = "indianred", color_total = "steelblue", color_border = "black", alpha_bars = 1, title = "Occurrence of gap sizes", subtitle = "Gap sizes (NAs in a row) ordered by most common", xlab = NULL, ylab = "Number occurrence", legend = TRUE, orientation = "horizontal", label_occurrence = "Number occurrence gapsize", label_total = "Resulting NAs for gapsize", theme = ggplot2::theme_linedraw() ) } \arguments{ \item{x}{Numeric Vector (\code{\link[base]{vector}}) or Time Series (\code{\link[stats]{ts}}) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.} \item{limit}{Specifies how many of the most common gap sizes are shown in the plot. Default is 10. So only the 10 most often occurring gapsizes will be shown. If more or all present gap sizes should be displayed, the limit needs to be increased.
Since this might add a lot of additional data, having parameter \code{orientation} set to 'horizontal' avoids overlaps in the axis labels.} \item{include_total}{When set to TRUE the total NA count for a gapsize is included in the plot (total = number occurrence x gap size). E.g. if a gapsize of 3 occurs 10 times, this means this gap size makes up for 30 NAs in total. This can be a good indicator of the overall impact of a gapsize.} \item{ranked_by}{Should the results be sorted according to the number of occurrence or total resulting NAs for a gapsize. Total resulting NAs are calculated by (total = number occurrence x gap size). \itemize{ \item{"occurrence" - Sorting by 'number of occurrence' of a gap size} \item{"total" - Sorting by 'total resulting NAs' of a gap size} } The default setting is "occurrence".} \item{color_occurrence}{Defines the Color for the Bars of 'number of occurrence'.} \item{color_total}{Defines the color for the bars of 'total resulting NAs'.} \item{color_border}{Defines the color for the border of the bars.} \item{alpha_bars}{Alpha (transparency) value used for filling the bars.} \item{title}{Title of the Plot.} \item{subtitle}{Subtitle of the Plot.} \item{xlab}{Label for x-Axis.} \item{ylab}{Label for y-Axis.} \item{legend}{If TRUE a legend is added at the bottom.} \item{orientation}{Can be either 'vertical' or 'horizontal'. Defines if the bars are plotted vertically or horizontally. For large amounts of different gap sizes horizontal illustration is favorable (also see parameter \code{limit}).} \item{label_occurrence}{Defines the label assigned to 'number of occurrence' in the legend.} \item{label_total}{Defines the label assigned to 'total resulting NAs' in the legend.} \item{theme}{Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). 
(\code{\link[ggplot2]{theme_linedraw})}} } \value{ The output is a \code{\link[ggplot2]{ggplot2}} object that can be further adjusted by using the ggplot syntax } \description{ Visualize the Number of Occurrences for existing NA Gap Sizes (NAs in a row) in a Time Series } \details{ This plotting function can be used to visualize the length of the NA gaps (NAs in a row) in a time series. It shows a ranking of which gap sizes occur most often. This ranking can be ordered by the number of occurrences of the gap sizes or by total resulting NAs for this gap size (occurrence * gap length). A NA-gap of 3 occurring 10 times means 30 total resulting NAs. A resulting plot can for example be described like this: a 2 NA-gap (2 NAs in a row) occurred 27 times, a 9 NA-gap (9 NAs in a row) occurred 11 times, a 27 NA-gap (27 NAs in a row) occurred 1 time, ... The only really needed parameter for this function is x (the univariate time series with NAs that shall be visualized). All other parameters are solely for altering the appearance of the plot. As long as the input is univariate and numeric, the function also takes data.frame, tibble, tsibble, zoo, xts as an input. The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made.
} \examples{ # Example 1: Visualize the top gap sizes in tsNH4 (top 10 by default) ggplot_na_gapsize(tsNH4) # Example 2: Visualize the top gap sizes in tsAirgap - vertical bars ggplot_na_gapsize(tsAirgap, orientation = "vertical") # Example 3: Same as example 1, just written with pipe operator tsNH4 \%>\% ggplot_na_gapsize() # Example 4: Visualize the top 20 gap sizes in tsNH4 ggplot_na_gapsize(tsNH4, limit = 20) # Example 5: Visualize top gap sizes in tsNH4 without showing total NAs ggplot_na_gapsize(tsNH4, limit = 20, include_total = FALSE) # Example 6: Visualize top gap sizes in tsNH4 but ordered by total NAs # (total = occurrence * gap length) ggplot_na_gapsize(tsNH4, limit = 20, ranked_by = "total") # Example 7: Visualize top gap sizes in tsNH4 - different theme # Plot adjustments via ggplot_na_gapsize function parameters ggplot_na_gapsize(tsNH4, theme = ggplot2::theme_classic()) # Example 8: Visualize top gap sizes in tsNH4 - title, subtitle in center # Plot adjustments via ggplot2 syntax ggplot_na_gapsize(tsNH4) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) # Example 9: Visualize top gap sizes in tsNH4 - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_gapsize(tsNH4, subtitle = NULL) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) # Example 10: Top gap sizes in tsNH4 - legend on the right and color change # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_gapsize(tsNH4, color_total = "grey") + ggplot2::theme(legend.position = "right") } \seealso{ \code{\link[imputeTS]{ggplot_na_gapsize2}}, \code{\link[imputeTS]{ggplot_na_distribution}}, \code{\link[imputeTS]{ggplot_na_distribution2}}, \code{\link[imputeTS]{ggplot_na_imputations}} } \author{ Steffen Moritz, Sebastian Gatscha } ================================================ FILE: man/ggplot_na_gapsize2.Rd
================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ggplot_na_gapsize2.R \name{ggplot_na_gapsize2} \alias{ggplot_na_gapsize2} \title{Bubble Plot to Visualize Total NA Count of NA gap sizes} \usage{ ggplot_na_gapsize2( x, colors_bubbles = c("#FCFBFF", "#EFEEFA", "#DDDAEF", "#C8C3E2", "#B1AAD4", "#9A8FC4", "#8273B5", "#6B56A7", "#553695", "#3D1778"), color_border = "black", alpha_bubbles = 0.4, labels_bubbles = "none", size_bubbles = 25, min_totals = NULL, min_occurrence = NULL, min_gapsize = NULL, max_gapsize = NULL, title = "Gap Size Analysis", subtitle = "Total NA counts for different gapsizes", xlab = "Gapsize", ylab = "Number occurrence", legend = TRUE, legend_breaks = 4, legend_title = "Total NAs", legend_position = "right", legend_point_sizes = "default", theme = ggplot2::theme_linedraw() ) } \arguments{ \item{x}{Numeric Vector (\code{\link[base]{vector}}) or Time Series (\code{\link[stats]{ts}}) object containing NAs. This is the only mandatory parameter - all other parameters are only needed for adjusting the plot appearance.} \item{colors_bubbles}{Choose a color gradient that encodes lower to higher total NA counts. Color codes can be given as vector. Using color palettes from colorspace, grDevices, RColorBrewer or other packages is useful here. E.g. grDevices::heat.colors(10) would be a possible input.} \item{color_border}{Color for the border of the bubbles.} \item{alpha_bubbles}{Alpha (transparency) value used for filling the bubbles.} \item{labels_bubbles}{Should labels be added to the individual bubbles inside the plot. For many datasets there will be overplotting issues once labels are added. In these cases using the min_gapsize, min_totals or min_occurrence options might be useful to only display the most relevant gap sizes. 
You can choose between these labels to be added: \itemize{ \item{"none" - No label gets added to the bubbles} (default choice) \item{"gap" - Adds a label displaying the gap size belonging to the respective bubble} \item{"total" - Adds a label displaying the total NA count for the respective bubble} \item{"gap-occurrence" - Adds a label displaying the respective gap size and number of its occurrence} } The default setting is "none".} \item{size_bubbles}{Allows to scale the size of the bubbles. Some experimenting with this parameter might be needed to get a good visualization for your specific dataset.} \item{min_totals}{Only print bubbles for gap sizes that account for at least min_totals NAs in the time series.} \item{min_occurrence}{Only print bubbles for gap sizes that occur at least min_occurrence times in the time series.} \item{min_gapsize}{Only show gap sizes larger than min_gapsize. Together with max_gapsize enables zooming into certain regions of interest.} \item{max_gapsize}{Only show gapsizes smaller than max_gapsize. Together with min_gapsize enables zooming into certain regions of interest.} \item{title}{Title of the Plot.} \item{subtitle}{Subtitle of the Plot.} \item{xlab}{Label for x-Axis.} \item{ylab}{Label for y-Axis.} \item{legend}{If TRUE a legend is added on the right side} \item{legend_breaks}{Number of displayed breaks / labels in the legend. Needs an integer giving the desired number of breaks as input. Breakpoints are internally calculated by R's pretty() function, which can also lead to values slightly smaller or larger than the desired number.} \item{legend_title}{Defines the title of the legend.} \item{legend_position}{Defines position of the legend. Choose either 'bottom', 'right', 'left' or 'top'.} \item{legend_point_sizes}{Defines the size of the symbols representing the total NA bubbles in the legend. You can choose between "default", "actual" or a custom vector of sizes.
\itemize{ \item{"default" - Scales the points in the legend to symbolically resemble the size differences} (default choice) \item{"actual" - Scales the points in the legend according to their actual size in the plot} } Since these two options are not always sufficient, a custom vector of sizes can be used as input. This would look like this: c(4,5,6,7). Be aware, that the length of this vector must match the number of breakpoints (can be adjusted with legend_breaks).} \item{theme}{Set a theme for ggplot2. Default is ggplot2::theme_linedraw(). (\code{\link[ggplot2]{theme_linedraw})}} } \value{ The output is a \code{\link[ggplot2]{ggplot2}} object that can be further adjusted by using the ggplot syntax } \description{ Visualize the total NA count (gap size * occurrence) for the existing gap sizes (NAs in a row). } \details{ This function visualizes total NA counts by individual gap size (consecutive NAs) in a time series. The bubble plot makes it easy to see which gap sizes account for most of the NAs in the series. The size and color of the bubbles represent the total number of NAs a given gap size accounts for. Total NAs for a gap size are calculated as follows: total NAs = occurrence * gap length For example, interpret a bubble for gap size 2 as follows: a 2-NA gap (two NAs in a row) occurred 27 times in the time series and thus accounts for 54 total NAs. On the x-axis, the different gap sizes are plotted in increasing order. The y-axis shows the occurrence count of these gap sizes in the time series. The plot is useful for investigating possible root causes of the missing data. It can indicate whether the missing data are random or whether there are patterns of interest. Depending on the input time series, there might be too much information in the plot, leading to overplotting. In these cases, use the parameters \code{min_totals}, \code{min_occurrence}, and \code{min_gapsize} to display only the information of interest.
The only required parameter is \code{x} (the univariate time series with NAs to visualize). All other parameters alter the appearance of the plot. As long as the input is univariate and numeric, the function also accepts \code{data.frame}, \code{tibble}, \code{tsibble}, \code{zoo}, or \code{xts} input. The plot can be adjusted via function parameters. For more complex adjustments, you can modify the result using ggplot2 syntax, since the function returns a ggplot2 object. See the Examples for typical adjustments. } \examples{ # Example 1: Visualize total NA counts in tsNH4 ggplot_na_gapsize2(tsNH4) # Example 2: Visualize total NA counts in tsNH4, different color gradient ggplot_na_gapsize2(tsNH4, colors_bubbles = rev(grDevices::heat.colors(10))) # Example 3: Same as example 1, just written with pipe operator tsNH4 \%>\% ggplot_na_gapsize2() # Example 4: Visualize total NA counts in tsHeating # Limited to gap sizes that account for a total of > 600 NAs ggplot_na_gapsize2(tsHeating, min_totals = 600) # Example 5: Visualize total NA counts in tsNH4 - no legend ggplot_na_gapsize2(tsNH4, legend = FALSE) # Example 6: Visualize total NA counts in tsAirgap - increased bubble size ggplot_na_gapsize2(tsAirgap, size_bubbles = 35) # Example 7: Visualize total NA counts in tsNH4 # Plot adjustments via ggplot_na_gapsize2 function parameters ggplot_na_gapsize2(tsNH4, theme = ggplot2::theme_classic()) # Example 8: Visualize total NA counts in tsNH4 - title, subtitle in center # Plot adjustments via ggplot2 syntax ggplot_na_gapsize2(tsNH4) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) # Example 9: Visualize total NA counts in tsNH4 - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_gapsize2(tsNH4, subtitle = NULL) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) # Example 10: Total NA counts in tsNH4 - legend on the bottom and 
color change # Plot adjustments via ggplot2 syntax and function parameters ggplot_na_gapsize2(tsNH4, colors_bubbles = grDevices::heat.colors(10)) + ggplot2::theme(legend.position = "bottom") } \seealso{ \code{\link[imputeTS]{ggplot_na_distribution}}, \code{\link[imputeTS]{ggplot_na_distribution2}}, \code{\link[imputeTS]{ggplot_na_gapsize}}, \code{\link[imputeTS]{ggplot_na_imputations}} } \author{ Steffen Moritz } ================================================ FILE: man/ggplot_na_imputations.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ggplot_na_imputations.R \name{ggplot_na_imputations} \alias{ggplot_na_imputations} \title{Line Plot to Visualize Imputed Values} \usage{ ggplot_na_imputations( x_with_na, x_with_imputations, x_with_truth = NULL, x_axis_labels = NULL, title = "Imputed Values", subtitle = "Visualization of missing value replacements", xlab = "Time", ylab = "Value", color_points = "steelblue", color_imputations = "indianred", color_truth = "seagreen3", color_lines = "lightslategray", shape_points = 16, shape_imputations = 18, shape_truth = 16, size_points = 1.5, size_imputations = 2.5, size_truth = 1.5, width_lines = 0.5, linetype = "solid", connect_na = TRUE, legend = TRUE, legend_size = 5, label_known = "known values", label_imputations = "imputed values", label_truth = "ground truth", theme = ggplot2::theme_linedraw() ) } \arguments{ \item{x_with_na}{Numeric Vector or Time Series (\code{\link{ts}}) object with NAs before imputation. This parameter and x_with_imputations have to be set. The rest of the parameters are mostly needed for adjusting the plot appearance.} \item{x_with_imputations}{Numeric Vector or Time Series (\code{\link{ts}}) object with NAs replaced by imputed values.
This parameter and x_with_na have to be set. The rest of the parameters are mostly needed for adjusting the plot appearance.} \item{x_with_truth}{Numeric Vector or Time Series (\code{\link{ts}}) object with the real values (optional parameter). If the ground truth is known (e.g. in experiments where the missing values were artificially added) it can be displayed in the plot with this parameter. Default is NULL (ground truth not known).} \item{x_axis_labels}{For adding specific x-axis labels. Takes a vector of \code{\link[base]{Date}} or \code{\link[base]{POSIXct}} objects as an input (needs the same length as x_with_na). The Default (NULL) uses the observation numbers as x-axis tick labels.} \item{title}{Title of the Plot.} \item{subtitle}{Subtitle of the Plot.} \item{xlab}{Label for x-Axis.} \item{ylab}{Label for y-Axis.} \item{color_points}{Color for the Symbols/Points of the non-NA Observations.} \item{color_imputations}{Color for the Symbols/Points of the Imputed Values.} \item{color_truth}{Color for the Symbols/Points of the NA value Ground Truth (only relevant when x_with_truth available).} \item{color_lines}{Color for the Lines connecting the Observations/Points.} \item{shape_points}{Shape for the Symbols/Points of the non-NA observations. See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference.} \item{shape_imputations}{Shape for the Symbols/Points of the imputed values.
See https://ggplot2.tidyverse.org/articles/ggplot2-specs.html as reference.} \item{shape_truth}{Shape for the Symbols/Points of the NA value Ground Truth (only relevant when x_with_truth available).} \item{size_points}{Size for the Symbols/Points of the non-NA Observations.} \item{size_imputations}{Size for the Symbols/Points of the Imputed Values.} \item{size_truth}{Size for the Symbols/Points of the NA value Ground Truth (only relevant when x_with_truth available).} \item{width_lines}{Width for the Lines connecting the Observations/Points.} \item{linetype}{Linetype for the Lines connecting the Observations/Points.} \item{connect_na}{If TRUE the Imputations are connected to the non-NA observations in the plot. Otherwise there are no connecting lines between symbols in NA areas.} \item{legend}{If TRUE a Legend is added at the bottom.} \item{legend_size}{Size of the Symbols used in the Legend.} \item{label_known}{Legend label for the non-NA Observations.} \item{label_imputations}{Legend label for the Imputed Values.} \item{label_truth}{Legend label for the Ground Truth of the NA values.} \item{theme}{Set a Theme for ggplot2. Default is ggplot2::theme_linedraw(). (\code{\link[ggplot2]{theme_linedraw})}} } \description{ Visualize the imputed values in a time series. } \details{ This plot can be used to visualize imputed values for a time series. Imputed values (filled NA gaps) are shown in a different color than the other values. If real values (ground truth) for the NA gaps are known, they can be optionally added in a different color. The only really needed parameters for this function are x_with_na (the time series with NAs before imputation) and x_with_imputations (the time series without NAs after imputation). All other parameters are mostly for altering the appearance of the plot. As long as the input is univariate and numeric the function also takes data.frame, tibble, tsibble, zoo, xts as an input.
The plot can be adjusted to your needs via the function parameters. Additionally, for more complex adjustments, the output can also be adjusted via ggplot2 syntax. This is possible, since the output of the function is a ggplot2 object. Also take a look at the Examples to see how adjustments are made. } \examples{ # Example 1: Visualize imputation by na_mean imp_mean <- na_mean(tsAirgap) ggplot_na_imputations(tsAirgap, imp_mean) # Example 2: Visualize imputation by na_locf and added ground truth imp_locf <- na_locf(tsAirgap) ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_locf, x_with_truth = tsAirgapComplete ) # Example 3: Visualize imputation by na_kalman imp_kalman <- na_kalman(tsAirgap) ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_kalman) # Example 4: Same as example 1, just written with pipe operator tsAirgap \%>\% na_mean() \%>\% ggplot_na_imputations(x_with_na = tsAirgap) # Example 5: Visualize imputation by na_seadec - different color for imputed points # Plot adjustments via ggplot_na_imputations function parameters imp_seadec <- na_seadec(tsAirgap) ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_seadec, color_imputations = "gold") # Example 6: Visualize imputation - different theme, point size imputations # Plot adjustments via ggplot_na_imputations function parameters imp_seadec <- na_seadec(tsAirgap) ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_seadec, theme = ggplot2::theme_classic(), size_imputations = 5) # Example 7: Visualize imputation - title, subtitle in center # Plot adjustments via ggplot2 syntax imp_seadec <- na_seadec(tsAirgap) ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_seadec) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) # Example 8: Visualize imputation - title in center, no subtitle # Plot adjustments via ggplot2 syntax and function 
parameters imp_mean <- na_mean(tsAirgap) ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp_mean, subtitle = NULL) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) } \seealso{ \code{\link[imputeTS]{ggplot_na_distribution}}, \code{\link[imputeTS]{ggplot_na_distribution2}}, \code{\link[imputeTS]{ggplot_na_gapsize}}, \code{\link[imputeTS]{ggplot_na_gapsize2}} } \author{ Steffen Moritz, Sebastian Gatscha } ================================================ FILE: man/ggplot_na_intervals.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{ggplot_na_intervals} \alias{ggplot_na_intervals} \title{Discontinued - Use \code{\link[imputeTS]{ggplot_na_distribution2}} instead.} \usage{ ggplot_na_intervals(x, ...) } \description{ plotNA.distributionBar was replaced by \code{\link[imputeTS]{ggplot_na_distribution2}}. The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions. } \keyword{internal} ================================================ FILE: man/imputeTS-package.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/imputeTS-package.R \docType{package} \name{imputeTS-package} \alias{imputeTS} \alias{imputeTS-package} \title{imputeTS: Time Series Missing Value Imputation} \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} Imputation (replacement) of missing values in univariate time series. Offers several imputation functions and missing data plots. 
Available imputation algorithms include: 'Mean', 'LOCF', 'Interpolation', 'Moving Average', 'Seasonal Decomposition', 'Kalman Smoothing on Structural Time Series models', 'Kalman Smoothing on ARIMA models'. Published in Moritz and Bartz-Beielstein (2017) \doi{10.32614/RJ-2017-009}. The imputeTS package is a collection of algorithms and tools for univariate time series imputation. } \details{ The imputeTS package specializes in (univariate) time series imputation. It offers several different imputation algorithm implementations. Beyond the imputation algorithms the package also provides plotting and printing functions of missing data statistics. The package is easy to use: \itemize{ \item To impute (fill all missing values) in a time series \code{x}, run:\cr \code{na_interpolation(x)} \cr \item To plot missing data statistics for a time series \code{x}, run:\cr \code{ggplot_na_distribution(x)}\cr \item To print missing data statistics for a time series \code{x}, run:\cr \code{statsNA(x)}\cr } Every other imputation function (starting with na_'algorithm name') and plotting function (starting with ggplot_na_'plot name') works the same way as in this example. } \references{ Moritz, Steffen, and Thomas Bartz-Beielstein. "imputeTS: Time Series Missing Value Imputation in R." R Journal 9.1 (2017). doi:10.32614/RJ-2017-009.
} \seealso{ Useful links: \itemize{ \item \url{https://github.com/SteffenMoritz/imputeTS} \item \url{https://steffenmoritz.github.io/imputeTS/} \item Report bugs at \url{https://github.com/SteffenMoritz/imputeTS/issues} } } \author{ \strong{Maintainer}: Steffen Moritz \email{steffen.moritz10@gmail.com} (\href{https://orcid.org/0000-0002-0085-1804}{ORCID}) [copyright holder] Authors: \itemize{ \item Sebastian Gatscha \email{sebastian_gatscha@gmx.at} } Other contributors: \itemize{ \item Earo Wang \email{earo.wang@gmail.com} (\href{https://orcid.org/0000-0001-6448-5260}{ORCID}) [contributor] \item Ron Hause \email{ronaldhause@gmail.com} (\href{https://orcid.org/0000-0002-5229-7366}{ORCID}) [contributor] } } \keyword{internal} ================================================ FILE: man/na.interpolation.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.interpolation} \alias{na.interpolation} \title{Deprecated use \code{\link[imputeTS]{na_interpolation}} instead.} \usage{ na.interpolation(x, option = "linear", maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{option}{Algorithm to be used. Accepts the following input: \itemize{ \item{"linear" - for linear interpolation using \link{approx} } (default choice) \item{"spline" - for spline interpolation using \link{spline}} \item{"stine" - for Stineman interpolation using \link[stinepack]{stinterp}} }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. 
This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters to be passed through to \link{approx} or \link{spline} interpolation functions} } \description{ na.interpolation is replaced by \code{\link[imputeTS]{na_interpolation}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.kalman.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.kalman} \alias{na.kalman} \title{Deprecated use \code{\link[imputeTS]{na_kalman}} instead.} \usage{ na.kalman(x, model = "StructTS", smooth = TRUE, nit = -1, maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{model}{Model to be used. With this parameter the State Space Model (on which KalmanSmooth is performed) can be chosen. Accepts the following input: \itemize{ \item{"StructTS" - For using a structural model fitted by maximum likelihood (using \link[stats]{StructTS}) } (default choice) \item{"auto.arima" - For using the state space representation of arima model (using \link[forecast]{auto.arima})} } For both auto.arima and StructTS additional parameters for model building can be given with the \dots parameter Additionally it is also possible to use a user created state space model (See code Example 5). This state space model could for example be obtained from another R package for structural time series modeling. Furthermore providing the state space representation of a arima model from \link[stats]{arima} is also possible. But it is important to note, that user created state space models must meet the requirements specified under \link[stats]{KalmanLike}. 
This means the user supplied state space model has to be in form of a list with at least components T, Z, h , V, a, P, Pn. (more details under \link[stats]{KalmanLike})} \item{smooth}{if \code{TRUE} - \code{\link[stats]{KalmanSmooth}} is used for estimation, if \code{FALSE} - \code{\link[stats]{KalmanRun}} is used. Since KalmanRun is often considered extrapolation KalmanSmooth is usually the better choice for imputation.} \item{nit}{Parameter from Kalman Filtering (see \link[stats]{KalmanLike}). Usually no need to change from default.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters to be passed through to the functions that build the State Space Models (\link[stats]{StructTS} or \link[forecast]{auto.arima}).} } \description{ na.kalman is replaced by \code{\link[imputeTS]{na_kalman}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.locf.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.locf} \alias{na.locf} \title{Deprecated use \code{\link[imputeTS]{na_locf}} instead.} \usage{ na.locf(x, option = "locf", na.remaining = "rev", maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{option}{Algorithm to be used. 
Accepts the following input: \itemize{ \item{"locf" - for Last Observation Carried Forward} (default choice) \item{"nocb" - for Next Observation Carried Backward} }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \description{ na.locf is replaced by \code{\link[imputeTS]{na_locf}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.ma.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.ma} \alias{na.ma} \title{Deprecated use \code{\link[imputeTS]{na_ma}} instead.} \usage{ na.ma(x, k = 4, weighting = "exponential", maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{k}{integer width of the moving average window. Expands to both sides of the center element e.g. k=2 means 4 observations (2 left, 2 right) are taken into account. If all observations in the current window are NA, the window size is automatically increased until there are at least 2 non-NA values present.} \item{weighting}{Weighting to be used. Accepts the following input: \itemize{ \item{"simple" - Simple Moving Average (SMA)} \item{"linear" - Linear Weighted Moving Average (LWMA)} \item{"exponential" - Exponential Weighted Moving Average (EWMA)} (default choice) }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. 
With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \description{ na.ma is replaced by \code{\link[imputeTS]{na_ma}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.mean.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.mean} \alias{na.mean} \title{Deprecated use \code{\link[imputeTS]{na_mean}} instead.} \usage{ na.mean(x, option = "mean", maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{option}{Algorithm to be used. Accepts the following input: \itemize{ \item{"mean" - take the mean for imputation (default choice)} \item{"median" - take the median for imputation} \item{"mode" - take the mode for imputation} \item{"harmonic" - take the harmonic mean} \item{"geometric" - take the geometric mean} }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \description{ na.mean is replaced by \code{\link[imputeTS]{na_mean}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). 
} \keyword{internal} ================================================ FILE: man/na.random.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.random} \alias{na.random} \title{Deprecated use \code{\link[imputeTS]{na_random}} instead.} \usage{ na.random(x, lower_bound = NULL, upper_bound = NULL, maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{lower_bound}{Lower bound for the random samples. If nothing or NULL is set min(x) will be used.} \item{upper_bound}{Upper bound for the random samples. If nothing or NULL is set max(x) will be used.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \description{ na.random is replaced by \code{\link[imputeTS]{na_random}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.remove.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.remove} \alias{na.remove} \title{Deprecated use \code{\link[imputeTS]{na_remove}} instead.} \usage{ na.remove(x, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} } \description{ na.remove is replaced by \code{\link[imputeTS]{na_remove}}. The functionality stays the same.
The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.replace.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.replace} \alias{na.replace} \title{Deprecated use \code{\link[imputeTS]{na_replace}} instead.} \usage{ na.replace(x, fill = 0, maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{fill}{Value used to replace the missing values} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \description{ na.replace is replaced by \code{\link[imputeTS]{na_replace}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.seadec.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.seadec} \alias{na.seadec} \title{Deprecated use \code{\link[imputeTS]{na_seadec}} instead.} \usage{ na.seadec( x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ... ) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{algorithm}{Algorithm to be used after decomposition. 
Accepts the following input: \itemize{ \item{"interpolation" - Imputation by Interpolation} (default choice) \item{"locf" - Imputation by Last Observation Carried Forward} \item{"mean" - Imputation by Mean Value} \item{"random" - Imputation by Random Sample} \item{"kalman" - Imputation by Kalman Smoothing and State Space Models} \item{"ma" - Imputation by Weighted Moving Average} }} \item{find_frequency}{If TRUE the algorithm will try to estimate the frequency of the time-series automatically.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters for these algorithms that can be passed through. Look at \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_mean}} for parameter options.} } \description{ na.seadec is replaced by \code{\link[imputeTS]{na_seadec}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na.seasplit.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{na.seasplit} \alias{na.seasplit} \title{Deprecated use \code{\link[imputeTS]{na_seasplit}} instead.} \usage{ na.seasplit( x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ... ) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{algorithm}{Algorithm to be used after splits. 
Accepts the following input: \itemize{ \item{"interpolation" - Imputation by Interpolation} (default choice) \item{"locf" - Imputation by Last Observation Carried Forward} \item{"mean" - Imputation by Mean Value} \item{"random" - Imputation by Random Sample} \item{"kalman" - Imputation by Kalman Smoothing and State Space Models} \item{"ma" - Imputation by Weighted Moving Average} }} \item{find_frequency}{If TRUE the algorithm will try to estimate the frequency of the time-series automatically.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters for these algorithms that can be passed through. Look at \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_mean}} for parameter options.} } \description{ na.seasplit is replaced by \code{\link[imputeTS]{na_seasplit}}. The functionality stays the same. The new name better fits modern R code style guidelines (which prefer _ over . in function names). } \keyword{internal} ================================================ FILE: man/na_interpolation.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_interpolation.R \name{na_interpolation} \alias{na_interpolation} \title{Missing Value Imputation by Interpolation} \usage{ na_interpolation(x, option = "linear", maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{option}{Algorithm to be used. 
Accepts the following input: \itemize{ \item{"linear" - for linear interpolation using \link{approx} } (default choice) \item{"spline" - for spline interpolation using \link{spline}} \item{"stine" - for Stineman interpolation using \link[stinepack]{stinterp}} }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters to be passed through to \link{approx} or \link{spline} interpolation functions} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Uses either linear, spline or stineman interpolation to replace missing values. } \details{ Missing values get replaced by values of \link{approx}, \link{spline} or \link[stinepack]{stinterp} interpolation. The na_interpolation function also supports the use of additional parameters from the respective underlying interpolation functions. While usually not really needed, it is useful to know that this advanced use is in principle possible. These additional parameters are not specified explicitly in the na_interpolation function documentation. Take a look into the documentation of the \link[stinepack]{stinterp}, \link{approx} and \link{spline} functions to get an overview about these additional parameters. An example for such a parameter is the 'method' argument of spline, which can be used to further specify the type of spline to be used. Possible values are "fmm", "natural", "periodic", "monoH.FC" and "hyman" (as can be seen in the \link{spline} documentation). 
The respective function call using this additional parameter would look like this: \code{na_interpolation(x, option ="spline", method ="natural")} Like in this example other additional detail parameters (gained from \link{approx}, \link{spline}, \link[stinepack]{stinterp} documentation) can be used by just including them in the na_interpolation function call. As already mentioned, these advanced possibilities for setting parameters are only helpful for specific use cases. For regular use the standard parameters provided directly in the na_interpolation documentation should be more than enough. } \examples{ # Prerequisite: Create Time series with missing values x <- ts(c(2, 3, 4, 5, 6, NA, 7, 8)) # Example 1: Perform linear interpolation na_interpolation(x) # Example 2: Perform spline interpolation na_interpolation(x, option = "spline") # Example 3: Perform stine interpolation na_interpolation(x, option = "stine") # Example 4: Perform spline interpolation, with additional parameter pass through from spline() # Take a look at the 'Details' section of the na_interpolation documentation # for more information about advanced parameter pass through options na_interpolation(x, option ="spline", method ="natural") # Example 5: Same as example 1, just written with pipe operator x \%>\% na_interpolation() # Example 6: Same as example 2, just written with pipe operator x \%>\% na_interpolation(option = "spline") } \references{ Johannesson, Tomas, et al. (2015). "Package stinepack".
} \seealso{ \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz, Ron Hause } ================================================ FILE: man/na_kalman.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_kalman.R \name{na_kalman} \alias{na_kalman} \title{Missing Value Imputation by Kalman Smoothing and State Space Models} \usage{ na_kalman(x, model = "StructTS", smooth = TRUE, nit = -1, maxgap = Inf, ...) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{model}{Model to be used. With this parameter the State Space Model (on which KalmanSmooth is performed) can be chosen. Accepts the following input: \itemize{ \item{"StructTS" - For using a structural model fitted by maximum likelihood (using \link[stats]{StructTS}) } (default choice) \item{"auto.arima" - For using the state space representation of an arima model (using \link[forecast]{auto.arima})} } For both auto.arima and StructTS additional parameters for model building can be given with the \dots parameter. Additionally it is also possible to use a user created state space model (See code Example 5). This state space model could for example be obtained from another R package for structural time series modeling. Furthermore providing the state space representation of an arima model from \link[stats]{arima} is also possible. But it is important to note, that user created state space models must meet the requirements specified under \link[stats]{KalmanLike}. This means the user supplied state space model has to be in form of a list with at least components T, Z, h, V, a, P, Pn.
(more details under \link[stats]{KalmanLike})} \item{smooth}{if \code{TRUE} - \code{\link[stats]{KalmanSmooth}} is used for estimation, if \code{FALSE} - \code{\link[stats]{KalmanRun}} is used. Since KalmanRun is often considered extrapolation KalmanSmooth is usually the better choice for imputation.} \item{nit}{Parameter from Kalman Filtering (see \link[stats]{KalmanLike}). Usually no need to change from default.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters to be passed through to the functions that build the State Space Models (\link[stats]{StructTS} or \link[forecast]{auto.arima}).} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Uses Kalman Smoothing on structural time series models (or on the state space representation of an arima model) for imputation. } \details{ The KalmanSmoother used in this function is \code{\link[stats]{KalmanSmooth}}. It operates either on a \code{Basic Structural Model} obtained by \code{\link[stats]{StructTS}} or the state space representation of an ARMA model obtained by \code{\link[forecast]{auto.arima}}. For a detailed explanation of Kalman Filtering and State Space Models the following literature is a good starting point: \itemize{ \item{\cite{G. Welch, G. Bishop, An Introduction to the Kalman Filter. SIGGRAPH 2001 Course 8, 1995}} \item{\cite{Harvey, Andrew C. Forecasting, structural time series models and the Kalman filter. Cambridge university press, 1990} } \item{\cite{Grewal, Mohinder S. Kalman filtering.
Springer Berlin Heidelberg, 2011}} } } \examples{ # Example 1: Perform imputation with KalmanSmooth and StructTS model (the default settings) na_kalman(tsAirgap) # Example 2: Perform imputation with KalmanRun and StructTS model (the default model) na_kalman(tsAirgap, smooth = FALSE) # Example 3: Perform imputation with KalmanSmooth and StructTS model na_kalman(tsAirgap, model = "StructTS", smooth = TRUE) # Example 4: Perform imputation with KalmanSmooth and StructTS model with additional parameters na_kalman(tsAirgap, model = "StructTS", smooth = TRUE, type = "trend") # Example 5: Perform imputation with KalmanSmooth and user created model usermodel <- arima(tsAirgap, order = c(1, 0, 1))$model na_kalman(tsAirgap, model = usermodel) # Example 6: Same as example 1, just written with pipe operator tsAirgap \%>\% na_kalman() } \references{ Hyndman RJ and Khandakar Y (2008). "Automatic time series forecasting: the forecast package for R". Journal of Statistical Software, 26(3). } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_locf.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_locf.R \name{na_locf} \alias{na_locf} \title{Missing Value Imputation by Last Observation Carried Forward} \usage{ na_locf(x, option = "locf", na_remaining = "rev", maxgap = Inf) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{option}{Algorithm to be used.
Accepts the following input: \itemize{ \item{"locf" - for Last Observation Carried Forward} (default choice) \item{"nocb" - for Next Observation Carried Backward} }} \item{na_remaining}{Method to be used for remaining NAs. \itemize{ \item{"rev" - to perform nocb / locf from the reverse direction} (default choice) \item{"keep" - to return the series with NAs} \item{"rm" - to remove remaining NAs} \item{"mean" - to replace remaining NAs by overall mean} }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Replaces each missing value with the most recent present value prior to it (Last Observation Carried Forward- LOCF). Optionally this can also be done starting from the back of the series (Next Observation Carried Backward - NOCB). } \details{ \subsection{General Functionality}{ Replaces each missing value with the most recent present value prior to it (Last Observation Carried Forward - LOCF). This can also be done in reverse direction, starting from the end of the series (then called Next Observation Carried Backward - NOCB). } \subsection{Handling for NAs at the beginning of the series}{ In case one or more successive observations directly at the start of the time series are NA, there exists no 'last value' yet, that can be carried forward. Thus, no LOCF imputation can be performed for these NAs. As soon as the first non-NA value appears, LOCF can be performed as expected. The same applies to NOCB, but from the opposite direction. 
While this problem might appear seldom and will only affect a very small amount of values at the beginning, it is something to consider. The \code{na_remaining} parameter helps to define, what should happen with these values at the start, that would remain NA after pure LOCF. Default setting is \code{na_remaining = "rev"}, which performs nocb / locf from the other direction to fill these NAs. So an NA at the beginning will be filled with the next non-NA value appearing in the series. With \code{na_remaining = "keep"} NAs at the beginning (that can not be imputed with pure LOCF) are just left as remaining NAs. With \code{na_remaining = "rm"} NAs at the beginning of the series are completely removed. Thus, the time series is basically shortened. Also available is \code{na_remaining = "mean"}, which uses the overall mean of the time series to replace these remaining NAs. (but beware, mean is usually not a good imputation choice - even if it only affects the values at the beginning) } } \examples{ # Prerequisite: Create Time series with missing values x <- ts(c(NA, 3, 4, 5, 6, NA, 7, 8)) # Example 1: Perform LOCF na_locf(x) # Example 2: Perform NOCB na_locf(x, option = "nocb") # Example 3: Perform LOCF and remove remaining NAs na_locf(x, na_remaining = "rm") # Example 4: Same as example 1, just written with pipe operator x \%>\% na_locf() } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_ma.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_ma.R \name{na_ma} \alias{na_ma} \title{Missing Value Imputation by Weighted Moving Average} \usage{ na_ma(x, k = 4,
weighting = "exponential", maxgap = Inf) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{k}{integer width of the moving average window. Expands to both sides of the center element e.g. k=2 means 4 observations (2 left, 2 right) are taken into account. If all observations in the current window are NA, the window size is automatically increased until there are at least 2 non-NA values present.} \item{weighting}{Weighting to be used. Accepts the following input: \itemize{ \item{"simple" - Simple Moving Average (SMA)} \item{"linear" - Linear Weighted Moving Average (LWMA)} \item{"exponential" - Exponential Weighted Moving Average (EWMA)} (default choice) }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Missing value replacement by weighted moving average. Uses semi-adaptive window size to ensure all NAs are replaced. } \details{ In this function missing values get replaced by moving average values. Moving Averages are also sometimes referred to as "moving mean", "rolling mean", "rolling average" or "running average". The mean in this implementation is taken from an equal number of observations on either side of a central value. This means for an NA value at position \code{i} of a time series, the observations i-2, i-1 and i+1, i+2 (assuming a window size of k=2) are used to calculate the mean. Since it can in case of long NA gaps also occur, that all values next to the central value are also NA, the algorithm has a semi-adaptive window size.
Whenever there are less than 2 non-NA values in the complete window available, the window size is incrementally increased, till at least 2 non-NA values are there. In all other cases the algorithm sticks to the pre-set window size. There are options for using Simple Moving Average (SMA), Linear Weighted Moving Average (LWMA) and Exponential Weighted Moving Average (EWMA). SMA: all observations in the window are equally weighted for calculating the mean. LWMA: weights decrease in arithmetical progression. The observations directly next to a central value i, have weight 1/2, the observations one further away (i-2,i+2) have weight 1/3, the next (i-3,i+3) have weight 1/4, ... EWMA: uses weighting factors which decrease exponentially. The observations directly next to a central value i, have weight 1/2^1, the observations one further away (i-2,i+2) have weight 1/2^2, the next (i-3,i+3) have weight 1/2^3, ... } \examples{ # Example 1: Perform imputation with simple moving average na_ma(tsAirgap, weighting = "simple") # Example 2: Perform imputation with exponential weighted moving average na_ma(tsAirgap) # Example 3: Perform imputation with exponential weighted moving average, window size 6 na_ma(tsAirgap, k = 6) # Example 4: Same as example 1, just written with pipe operator tsAirgap \%>\% na_ma(weighting = "simple") } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_mean.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_mean.R \name{na_mean} \alias{na_mean} \title{Missing Value Imputation by Mean Value} \usage{ na_mean(x, option = "mean", maxgap = Inf) } 
\arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{option}{Algorithm to be used. Accepts the following input: \itemize{ \item{"mean" - take the mean for imputation (default choice)} \item{"median" - take the median for imputation} \item{"mode" - take the mode for imputation} \item{"harmonic" - take the harmonic mean} \item{"geometric" - take the geometric mean} }} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Missing value replacement by mean values. Different means like median, mean, mode possible. } \details{ Missing values get replaced by overall mean values. The function calculates the mean, median, mode, harmonic or geometric mean over all the non-NA values and replaces all NAs with this value. Option 'mode' replaces NAs with the most frequent value in the time series. If two or more values occur equally frequent, the function imputes the lower value. Due to their calculation formula geometric and harmonic mean are not well defined for negative values or zero values in the input series. In general using the mean for imputation is mostly a suboptimal choice and should be handled with great caution.
} \examples{ # Prerequisite: Create Time series with missing values x <- ts(c(2, 3, 4, 5, 6, NA, 7, 8)) # Example 1: Perform imputation with the overall mean na_mean(x) # Example 2: Perform imputation with overall median na_mean(x, option = "median") # Example 3: Same as example 1, just written with pipe operator x \%>\% na_mean() } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_random.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_random.R \name{na_random} \alias{na_random} \title{Missing Value Imputation by Random Sample} \usage{ na_random(x, lower_bound = NULL, upper_bound = NULL, maxgap = Inf) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{lower_bound}{Lower bound for the random samples. If nothing or NULL is set min(x) will be used.} \item{upper_bound}{Upper bound for the random samples. If nothing or NULL is set max(x) will be used.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Replaces each missing value by drawing a random sample between two given bounds.
} \details{ Replaces each missing value by drawing a random sample between two given bounds. The default bounds are the minimum and the maximum value in the non-NAs from the time series. Function uses \link{runif} function to get the random values. } \examples{ # Prerequisite: Create Time series with missing values x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8)) # Example 1: Replace all NAs by random values that are between min and max of the input time series na_random(x) # Example 2: Replace all NAs by random values between 1 and 10 na_random(x, lower_bound = 1, upper_bound = 10) # Example 3: Same as example 1, just written with pipe operator x \%>\% na_random() } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_remove.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_remove.R \name{na_remove} \alias{na_remove} \title{Remove Missing Values} \usage{ na_remove(x) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} } \value{ Vector (\code{\link{vector}}) } \description{ Removes all missing values from a time series. } \details{ Removes all missing values from an input time series. This shortens the time series by the number of missing values in the series. Should be handled with care, because this can affect the seasonality of the time series. Seasonal patterns might be destroyed. Independent from the input, this function only returns a vector. (the time information of a resulting time series object wouldn't be correct any more).
} \examples{ # Prerequisite: Create Time series with missing values x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8)) # Example 1: Remove all NAs na_remove(x) # Example 2: Remove all NAs in tsAirgap na_remove(tsAirgap) # Example 3: Same as example 1, just written with pipe operator x \%>\% na_remove() } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_replace.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_replace.R \name{na_replace} \alias{na_replace} \title{Replace Missing Values by a Defined Value} \usage{ na_replace(x, fill = 0, maxgap = Inf) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{fill}{Value used to replace the missing values} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Replaces all missing values with a given value.
} \examples{ # Prerequisite: Create Time series with missing values x <- ts(c(2, 3, NA, 5, 6, NA, 7, 8)) # Example 1: Replace all NAs with 3.5 na_replace(x, fill = 3.5) # Example 2: Replace all NAs with 0 na_replace(x, fill = 0) # Example 3: Same as example 1, just written with pipe operator x \%>\% na_replace(fill = 3.5) } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_seadec.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_seadec.R \name{na_seadec} \alias{na_seadec} \title{Seasonally Decomposed Missing Value Imputation} \usage{ na_seadec( x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ... ) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{algorithm}{Algorithm to be used after decomposition. Accepts the following input: \itemize{ \item{"interpolation" - Imputation by Interpolation} (default choice) \item{"locf" - Imputation by Last Observation Carried Forward} \item{"mean" - Imputation by Mean Value} \item{"random" - Imputation by Random Sample} \item{"kalman" - Imputation by Kalman Smoothing and State Space Models} \item{"ma" - Imputation by Weighted Moving Average} }} \item{find_frequency}{If TRUE the algorithm will try to estimate the frequency of the time-series automatically.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. 
This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters for these algorithms that can be passed through. Look at \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_mean}} for parameter options.} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Removes the seasonal component from the time series, performs imputation on the deseasonalized series and afterwards adds the seasonal component again. } \details{ The algorithm first performs a Seasonal Decomposition of Time Series by Loess via \code{\link[stats]{stl}}. Decomposing the time series into seasonal, trend and irregular components. The seasonal component gets then removed (subtracted) from the original series. As a second step the selected imputation algorithm e.g. na_locf, na_ma, ... is applied on the deseasonalized series. Thus, the algorithm can work without being affected by seasonal patterns. After filling the NA gaps, the seasonal component is added to the deseasonalized series again. Implementation details: A paper about the STL Decomposition procedure is linked in the references. Since the function only works with complete data, the initial NA data is temporarily filled via linear interpolation in order to perform the decomposition. These temporarily imputed values are replaced with NAs again after obtaining the decomposition for the non-NA observations. STL decomposition is run with robust = TRUE and s.window = 11. Additionally, applying STL decomposition needs a preset frequency. This can be passed by the frequency set in the input ts object or by setting 'find_frequency=TRUE' in order to find an appropriate frequency for the time series. 
The find_frequency parameter internally uses \code{\link[forecast]{findfrequency}}, which does a spectral analysis of the time series for identifying a suitable frequency. Using find_frequency will update the previously set frequency of a ts object to the newly found frequency. The default is 'find_frequency = FALSE', which gives a warning if no seasonality is set for the supplied time series object. If neither seasonality is set nor find_frequency is set to TRUE, the function goes on without decomposition and just applies the selected secondary algorithm to the original time series that still includes seasonality. } \examples{ # Example 1: Perform seasonal imputation using algorithm = "interpolation" na_seadec(tsAirgap, algorithm = "interpolation") # Example 2: Perform seasonal imputation using algorithm = "mean" na_seadec(tsAirgap, algorithm = "mean") # Example 3: Same as example 1, just written with pipe operator tsAirgap \%>\% na_seadec(algorithm = "interpolation") } \references{ R. B. Cleveland, W. S. Cleveland, J.E. McRae, and I. Terpenning (1990) STL: A Seasonal-Trend Decomposition Procedure Based on Loess. Journal of Official Statistics, 6, 3–73. } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seasplit}} } \author{ Steffen Moritz } ================================================ FILE: man/na_seasplit.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/na_seasplit.R \name{na_seasplit} \alias{na_seasplit} \title{Seasonally Splitted Missing Value Imputation} \usage{ na_seasplit( x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ... 
) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object in which missing values shall be replaced} \item{algorithm}{Algorithm to be used after splits. Accepts the following input: \itemize{ \item{"interpolation" - Imputation by Interpolation} (default choice) \item{"locf" - Imputation by Last Observation Carried Forward} \item{"mean" - Imputation by Mean Value} \item{"random" - Imputation by Random Sample} \item{"kalman" - Imputation by Kalman Smoothing and State Space Models} \item{"ma" - Imputation by Weighted Moving Average} }} \item{find_frequency}{If TRUE the algorithm will try to estimate the frequency of the time-series automatically.} \item{maxgap}{Maximum number of successive NAs to still perform imputation on. Default setting is to replace all NAs without restrictions. With this option set, consecutive NAs runs, that are longer than 'maxgap' will be left NA. This option mostly makes sense if you want to treat long runs of NA afterwards separately.} \item{...}{Additional parameters for these algorithms that can be passed through. Look at \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_mean}} for parameter options.} } \value{ Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object (dependent on given input at parameter x) } \description{ Splits the times series into seasons and afterwards performs imputation separately for each of the resulting time series datasets (each containing the data for one specific season). 
} \examples{ # Example 1: Perform seasonal splitted imputation using algorithm = "interpolation" na_seasplit(tsAirgap, algorithm = "interpolation") # Example 2: Perform seasonal splitted imputation using algorithm = "mean" na_seasplit(tsAirgap, algorithm = "mean") # Example 3: Same as example 1, just written with pipe operator tsAirgap \%>\% na_seasplit(algorithm = "interpolation") } \seealso{ \code{\link[imputeTS]{na_interpolation}}, \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}}, \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}}, \code{\link[imputeTS]{na_seadec}} } \author{ Steffen Moritz } ================================================ FILE: man/plotNA.distribution.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{plotNA.distribution} \alias{plotNA.distribution} \title{Discontinued - Use \code{\link[imputeTS]{ggplot_na_distribution}} instead.} \usage{ plotNA.distribution(x, ...) } \description{ plotNA.distribution was replaced by \code{\link[imputeTS]{ggplot_na_distribution}}. The new plotting function provides an improved version of the old plot, e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions. } \keyword{internal} ================================================ FILE: man/plotNA.distributionBar.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{plotNA.distributionBar} \alias{plotNA.distributionBar} \title{Discontinued - Use \code{\link[imputeTS]{ggplot_na_distribution2}} instead.} \usage{ plotNA.distributionBar(x, ...) 
} \description{ plotNA.distributionBar was replaced by \code{\link[imputeTS]{ggplot_na_distribution2}}. The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions. } \keyword{internal} ================================================ FILE: man/plotNA.gapsize.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{plotNA.gapsize} \alias{plotNA.gapsize} \title{Discontinued - Use \code{\link[imputeTS]{ggplot_na_gapsize}} instead.} \usage{ plotNA.gapsize(x, ...) } \description{ plotNA.gapsize was replaced by \code{\link[imputeTS]{ggplot_na_gapsize}}. The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. Versions 3.0 and below still have the old functions. } \keyword{internal} ================================================ FILE: man/plotNA.imputations.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/deprecated_defunct.R \name{plotNA.imputations} \alias{plotNA.imputations} \title{Discontinued - Use \code{\link[imputeTS]{ggplot_na_imputations}} instead.} \usage{ plotNA.imputations(x, ...) } \description{ plotNA.imputations was replaced by \code{\link[imputeTS]{ggplot_na_imputations}}. The new plotting function provides an improved version of the old plot e.g. it looks better now and is better adjustable, because it is based on ggplot2. If you absolutely want to use the old function, you need to download an older package version. 
Versions 3.0 and below still have the old functions. } \keyword{internal} ================================================ FILE: man/reexports.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/imputeTS-package.R \docType{import} \name{reexports} \alias{reexports} \alias{\%>\%} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} }} ================================================ FILE: man/statsNA.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/statsNA.R \name{statsNA} \alias{statsNA} \title{Print Statistics about Missing Values} \usage{ statsNA(x, bins = 4, print_only = TRUE) } \arguments{ \item{x}{Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}}) object containing NAs} \item{bins}{Split number for bin stats. Number of bins the time series gets divided into. For each bin information about amount/percentage of missing values is printed. Default value is 4 - which means stats about the 1st,2nd,3rd,4th quarter of the time series are shown.} \item{print_only}{Choose if the function Prints or Returns. For print_only = TRUE the function has no return value and just prints out missing value stats. If print_only is changed to FALSE, nothing is printed and the function returns a list. Print gives a little bit more information, since the returned list does not include "Stats for Bins" and "overview NA series"} } \value{ A \code{\link{list}} containing the stats. Beware: Function gives only a return value if print_only = FALSE. } \description{ Print summary stats about the distribution of missing values in a univariate time series.
} \details{ Prints the following information about the missing values in the time series: \itemize{ \item{"Length of time series" - Number of observations in the time series (including NAs)} \item{"Number of Missing Values" - Number of missing values in the time series} \item{"Percentage of Missing Values" - Percentage of missing values in the time series} \item{"Number of Gaps" - Number of NA gaps (consisting of one or more consecutive NAs) in the time series} \item{"Average Gap Size" - Average size of consecutive NAs for the NA gaps in the time series} \item{"Stats for Bins" - Number/percentage of missing values for the split into bins } \item{"Longest NA gap" - Longest series of consecutive missing values (NAs in a row) in the time series } \item{"Most frequent gap size" - Most frequently occurring series of missing values in the time series} \item{"Gap size accounting for most NAs" - The series of consecutive missing values that accounts for most missing values overall in the time series} \item{"Overview NA series" - Overview about how often each series of consecutive missing values occurs. Series occurring 0 times are skipped} } Furthermore, it is important to note that you can choose whether the function returns a list or only prints the information.
(see description of parameter "print_only") } \examples{ # Example 1: Print stats about the missing data in tsNH4 statsNA(tsNH4) # Example 2: Return list with stats about the missing data in tsAirgap statsNA(tsAirgap, print_only = FALSE) # Example 3: Same as example 1, just written with pipe operator tsNH4 \%>\% statsNA() } \seealso{ \code{\link[imputeTS]{ggplot_na_distribution}}, \code{\link[imputeTS]{ggplot_na_distribution2}}, \code{\link[imputeTS]{ggplot_na_gapsize}} } \author{ Steffen Moritz } ================================================ FILE: man/tsAirgap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tsAirgap.R \docType{data} \name{tsAirgap} \alias{tsAirgap} \title{Time series of monthly airline passengers (with NAs)} \format{ Time Series (\code{\link{ts}}) with 144 rows including 13 NAs. } \source{ \cite{Box, G. E. P., Jenkins, G. M., Reinsel, G. C. and Ljung, G. M. (2015). Time series analysis: forecasting and control. Fifth Edition. John Wiley and Sons.} } \usage{ tsAirgap } \description{ Monthly totals of international airline passengers, 1949 to 1960. This time series contains missing values. In the package included is also the \code{\link{tsAirgapComplete}} time series providing the true values for the missing values. } \details{ The dataset originates from Box and Jenkins (see citation) and is a commonly used example in time series analysis literature. Its characteristics (strong trend, strong seasonal behavior) also make it a great example for time series imputation. Thus the version with inserted NA gaps was created under the name tsAirgap. In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied.
There are the two time series: \itemize{ \item tsAirgap - The time series with NAs. \item tsAirgapComplete - Time series without NAs. } } \seealso{ \code{\link[imputeTS]{tsHeating}}, \code{\link[imputeTS]{tsNH4}} } \keyword{datasets} ================================================ FILE: man/tsAirgapComplete.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tsAirgapComplete.R \docType{data} \name{tsAirgapComplete} \alias{tsAirgapComplete} \title{Time series of monthly airline passengers (complete)} \format{ Time Series (\code{\link{ts}}) with 144 rows. } \source{ \cite{Box, G. E. P., Jenkins, G. M., Reinsel, G. C. and Ljung, G. M. (2015). Time series analysis: forecasting and control. Fifth Edition. John Wiley and Sons.} } \usage{ tsAirgapComplete } \description{ Monthly totals of international airline passengers, 1949 to 1960. This time series provides the truth for the missing values of the \code{\link{tsAirgap}} time series. Thus it is identical to the tsAirgap time series except that no value is missing. } \details{ The dataset originates from Box and Jenkins (see citation) and is a commonly used example in time series analysis literature. Its characteristics (strong trend, strong seasonal behavior) also make it a great example for time series imputation. Thus the version with inserted NA gaps was created under the name tsAirgap. In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. There are the two time series: \itemize{ \item tsAirgap - The time series with NAs. \item tsAirgapComplete - Time series without NAs.
} } \seealso{ \code{\link[imputeTS]{tsHeating}}, \code{\link[imputeTS]{tsNH4}} } \keyword{datasets} ================================================ FILE: man/tsHeating.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tsHeating.R \docType{data} \name{tsHeating} \alias{tsHeating} \title{Time series of a heating systems supply temperature (with NAs)} \format{ Time Series (\code{\link{ts}}) with 606837 rows including 57391 NAs. } \source{ \cite{Moritz, Steffen, Friese, Martina, Fischbach, Andreas, Schlitt, Christopher, and Bartz-Beielstein, Thomas. (2015, May 1). GECCO Industrial Challenge 2015 Dataset: A heating system dataset for the 'Recovering missing information in heating system operating data' competition at the Genetic and Evolutionary Computation Conference 2015, Madrid, Spain. http://doi.org/10.5281/zenodo.3884899 } } \usage{ tsHeating } \description{ Time series of a heating systems supply temperature. Measured from 18.11.2013 - 05:12:00 to 13.01.2015 - 15:08:00 in 1 minute steps. This time series contains missing values. In the package included is also the \code{\link{tsHeatingComplete}} time series providing the true values for the missing values. } \details{ The time series originates from the GECCO Industrial Challenge 2015. This Challenge was about "Recovering missing information in heating system operating data". Goal was to impute missing values in heating system sensor data as accurate as possible. (\doi{10.5281/zenodo.3884899}) In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series. There are the two time series: \itemize{ \item tsHeating - The time series with NAs. 
\item tsHeatingComplete - Time series without NAs. } } \seealso{ \code{\link[imputeTS]{tsAirgap}}, \code{\link[imputeTS]{tsNH4}} } \keyword{datasets} ================================================ FILE: man/tsHeatingComplete.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tsHeatingComplete.R \docType{data} \name{tsHeatingComplete} \alias{tsHeatingComplete} \title{Time series of a heating systems supply temperature (complete)} \format{ Time Series (\code{\link{ts}}) with 606837 rows. } \source{ \cite{Moritz, Steffen, Friese, Martina, Fischbach, Andreas, Schlitt, Christopher, and Bartz-Beielstein, Thomas. (2015, May 1). GECCO Industrial Challenge 2015 Dataset: A heating system dataset for the 'Recovering missing information in heating system operating data' competition at the Genetic and Evolutionary Computation Conference 2015, Madrid, Spain. http://doi.org/10.5281/zenodo.3884899 } } \usage{ tsHeatingComplete } \description{ Time series of a heating systems supply temperature. Measured from 18.11.2013 - 05:12:00 to 13.01.2015 - 15:08:00 in 1 minute steps. This time series provides the truth for the missing values of the \code{\link{tsHeating}} time series. Thus it is identical to the heating time series except that no value is missing. } \details{ The time series originates from the GECCO Industrial Challenge 2015. This Challenge was about "Recovering missing information in heating system operating data". Goal was to impute missing values in heating system sensor data as accurate as possible. (\doi{10.5281/zenodo.3884899}) In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series. 
There are the two time series: \itemize{ \item tsHeating - The time series with NAs. \item tsHeatingComplete - Time series without NAs. } } \seealso{ \code{\link[imputeTS]{tsAirgap}}, \code{\link[imputeTS]{tsNH4}} } \keyword{datasets} ================================================ FILE: man/tsNH4.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tsNH4.R \docType{data} \name{tsNH4} \alias{tsNH4} \title{Time series of NH4 concentration in a wastewater system (with NAs)} \format{ Time Series (\code{\link{ts}}) with 4552 rows including 883 NAs. } \source{ \cite{Friese, Martina, Fischbach, Andreas, Flasch, Oliver, Mersmann, Olaf, Bartz-Beielstein, Thomas, and Walbeck, Klaus. (2014, July 16). GECCO Industrial Challenge 2014 Dataset: A water quality dataset for the 'Active protection against pollution of the surface water' competition at the Genetic and Evolutionary Computation Conference 2014, Vancouver, Canada. http://www.spotseven.de/gecco-challenge/gecco-challenge-2014} } \usage{ tsNH4 } \description{ Time series of NH4 concentration in a wastewater system. Measured from 30.11.2010 - 16:10 to 01.01.2011 - 6:40 in 10 minute steps. This time series contains missing values. In the package included is also the \code{\link{tsNH4Complete}} time series providing the true values for the missing values. } \details{ The time series is derived from the dataset of the GECCO Industrial Challenge 2014. In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series. There are the two time series: \itemize{ \item tsNH4 - The time series with NAs. \item tsNH4Complete - Time series without NAs.
} } \seealso{ \code{\link[imputeTS]{tsAirgap}},\code{\link[imputeTS]{tsHeating}} } \keyword{datasets} ================================================ FILE: man/tsNH4Complete.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tsNH4Complete.R \docType{data} \name{tsNH4Complete} \alias{tsNH4Complete} \title{Time series of NH4 concentration in a wastewater system (complete)} \format{ Time Series (\code{\link{ts}}) with 4552 rows. } \source{ \cite{Friese, Martina, Fischbach, Andreas, Flasch, Oliver, Mersmann, Olaf, Bartz-Beielstein, Thomas, and Walbeck, Klaus. (2014, July 16). GECCO Industrial Challenge 2014 Dataset: A water quality dataset for the 'Active protection against pollution of the surface water' competition at the Genetic and Evolutionary Computation Conference 2014, Vancouver, Canada. http://www.spotseven.de/gecco-challenge/gecco-challenge-2014} } \usage{ tsNH4Complete } \description{ Time series of NH4 concentration in a wastewater system. Measured from 30.11.2010 - 16:10 to 01.01.2011 - 6:40 in 10 minute steps. This time series provides the truth for the missing values of the \code{\link{tsNH4}} time series. Thus it is identical to the tsNH4 time series except that no value is missing. } \details{ The time series is derived from the dataset of the GECCO Industrial Challenge 2014. In order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values, which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series. There are the two time series: \itemize{ \item tsNH4 - The time series with NAs. \item tsNH4Complete - Time series without NAs.
} } \seealso{ \code{\link[imputeTS]{tsAirgap}},\code{\link[imputeTS]{tsHeating}} } \keyword{datasets} ================================================ FILE: src/RcppExports.cpp ================================================ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; #ifdef RCPP_USE_GLOBAL_ROSTREAM Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // locf Rcpp::NumericVector locf(NumericVector x, bool reverse); RcppExport SEXP _imputeTS_locf(SEXP xSEXP, SEXP reverseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP); Rcpp::traits::input_parameter< bool >::type reverse(reverseSEXP); rcpp_result_gen = Rcpp::wrap(locf(x, reverse)); return rcpp_result_gen; END_RCPP } // ma Rcpp::NumericVector ma(NumericVector x, int k, String weighting); RcppExport SEXP _imputeTS_ma(SEXP xSEXP, SEXP kSEXP, SEXP weightingSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP); Rcpp::traits::input_parameter< int >::type k(kSEXP); Rcpp::traits::input_parameter< String >::type weighting(weightingSEXP); rcpp_result_gen = Rcpp::wrap(ma(x, k, weighting)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_imputeTS_locf", (DL_FUNC) &_imputeTS_locf, 2}, {"_imputeTS_ma", (DL_FUNC) &_imputeTS_ma, 3}, {NULL, NULL, 0} }; RcppExport void R_init_imputeTS(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } ================================================ FILE: src/locf.cpp ================================================ #include using namespace Rcpp; // [[Rcpp::export]] Rcpp::NumericVector locf(NumericVector x, bool reverse) { Rcpp::NumericVector z = 
clone(x); long n = z.size(); if (!reverse) { for(long i = 0; i < n; i++ ) { if (i % 1024 == 0) {Rcpp::checkUserInterrupt();} if(i > 0 && !R_finite(z[i]) && R_finite(z[i-1])) { z[i] = z[i-1]; } } } else { for(long i = n-1; i >= 0; i-- ) { if (i % 1024 == 0) {Rcpp::checkUserInterrupt();} if(i < n-1 && !R_finite(z[i]) && R_finite(z[i+1])) { z[i] = z[i+1]; } } } return z; } ================================================ FILE: src/ma.cpp ================================================ #include using namespace Rcpp; struct pow_wrapper { public: double operator()(double a, double b) { return ::pow(a, b); } }; NumericVector vecpow(const IntegerVector base, const NumericVector exp) { NumericVector out(base.size()); std::transform(base.cbegin(), base.cend(), exp.cbegin(), out.begin(), pow_wrapper()); return out; } // [[Rcpp::export]] Rcpp::NumericVector ma(NumericVector x, int k, String weighting) { Rcpp::NumericVector tempdata = clone(x); Rcpp::NumericVector out = clone(x); int n = tempdata.size(); for (int i = 0; i < n; i++ ) { // check for interrupt every 1024 iterations if (i % 1024 == 0) {Rcpp::checkUserInterrupt();} // If Value is NA -> impute it based on selected method if (ISNAN(tempdata[i])) { int ktemp = k; IntegerVector usedIndices = seq(i - ktemp, i + ktemp); usedIndices = usedIndices[usedIndices >= 0]; usedIndices = usedIndices[usedIndices < n]; NumericVector t = tempdata[usedIndices]; // Search for at least 2 NA-values while (sum(!is_na(t)) < 2) { ktemp = ktemp + 1; usedIndices = seq(i - ktemp, i + ktemp); usedIndices = usedIndices[usedIndices >= 0]; usedIndices = usedIndices[usedIndices < n]; t = tempdata[usedIndices]; } if (weighting =="simple") { // Calculate mean value NumericVector noNAs = wrap(na_omit(t)); out[i] = mean(noNAs); } else if(weighting == "linear") { // Calculate weights based on indices 1/(distance from current index+1) // Set weights where data is NA to 0 // Sum up all weights (needed later) to norm it // Create weighted data 
(weights*data) // Sum up NumericVector weightsData = 1 / (abs(usedIndices - i) + 1); LogicalVector naCheck = !is_na(t); weightsData = weightsData * as(naCheck); double sumWeights = sum(weightsData); NumericVector weightedData = (t * weightsData) / sumWeights; NumericVector noNAs = wrap(na_omit(weightedData)); out[i] = sum(noNAs); } else if (weighting == "exponential") { // Calculate weights based on indices 1/ 2 ^ (distance from current index) // Set weights where data is NA to 0 // Sum up all weights (needed later) to norm it // Create weighted data (weights*data) // Sum up NumericVector expo = abs(usedIndices - i); IntegerVector base = Rcpp::rep(2, expo.size()); NumericVector weightsData = 1 / (vecpow(base, expo)); LogicalVector naCheck = !is_na(t); weightsData = weightsData * as(naCheck); double sumWeights = sum(weightsData); NumericVector weightedData = (t * weightsData) / sumWeights; NumericVector noNAs = wrap(na_omit(weightedData)); out[i] = sum(noNAs); } else { stop("Wrong input for parameter weighting. Has to be \"simple\",\"linear\" or \"exponential\"." 
); } } } return out; } ================================================ FILE: tests/testthat/test-apply_base_algorithm.R ================================================ context("apply_base_algorithm") test_that("Warning for wrong algorithm choice", { expect_error( apply_base_algorithm(tsAirgap, algorithm = "wrongAlgorithm") ) }) ================================================ FILE: tests/testthat/test-depreciated_defunct.R ================================================ context("Defunct and Depreciated Functions") test_that("Correct error for old, defunct plotting functions", { expect_error(plotNA.distribution(tsAirgap), class = "defunctError") expect_error(plotNA.distributionBar(tsAirgap), class = "defunctError") expect_error(plotNA.gapsize(tsAirgap), class = "defunctError") expect_error(plotNA.imputations(tsAirgap), class = "defunctError") }) test_that("Correct warning for old, deprieciated imputation functions", { expect_warning(na.interpolation(tsAirgap), regexp = "replaced by") expect_warning(na.kalman(tsAirgap), regexp = "replaced by") expect_warning(na.locf(tsAirgap), regexp = "replaced by") expect_warning(na.ma(tsAirgap), regexp = "replaced by") expect_warning(na.mean(tsAirgap), regexp = "replaced by") expect_warning(na.random(tsAirgap), regexp = "replaced by") expect_warning(na.remove(tsAirgap), regexp = "replaced by") expect_warning(na.replace(tsAirgap), regexp = "replaced by") expect_warning(na.seadec(tsAirgap), regexp = "replaced by") expect_warning(na.seasplit(tsAirgap), regexp = "replaced by") }) ================================================ FILE: tests/testthat/test-error_handling.R ================================================ context("error-handling") # These test are to make sure, the right errors / warnings are given # for wrong input data or wrong parameter inputs # Especially important is the error handling for multivariate inputs like data.frames test_that("Too few non-NA data points", { x <- c(NA, NA, NA, NA, NA, NA, NA) 
expect_error(na_seasplit(x), regexp = "At least")
  expect_error(na_seadec(x), regexp = "At least")
  expect_error(na_random(x), regexp = "At least")
  expect_error(na_mean(x), regexp = "At least")
  expect_error(na_ma(x), regexp = "At least")
  expect_error(na_locf(x), regexp = "At least")
  expect_error(na_kalman(x), regexp = "At least")
  expect_error(na_interpolation(x), regexp = "At least")
})

# A character vector (with some NAs) must be rejected by every imputation
# function with an error matching "not numeric".
test_that("Data not numeric", {
  x1 <- rep("string",144)
  x1[3:15] <- NA
  expect_error(na_seasplit(x1), regexp = "not numeric")
  expect_error(na_seadec(x1), regexp = "not numeric")
  expect_error(na_random(x1), regexp = "not numeric")
  expect_error(na_mean(x1), regexp = "not numeric")
  expect_error(na_ma(x1), regexp = "not numeric")
  expect_error(na_locf(x1), regexp = "not numeric")
  expect_error(na_kalman(x1), regexp = "not numeric")
  expect_error(na_interpolation(x1), regexp = "not numeric")
})

# For a data.frame whose second column (x1) is a character column, each
# function is expected to warn that column 2 was not imputed.
# na_seasplit / na_seadec are additionally checked for the
# "No seasonality information" warning.
test_that("Correct error messages for multiveriate inputs", {
  x1 <- rep("string",144)
  x1[3:15] <- NA
  x2 <- rep(3,144)
  x2[3] <- NA
  xyz <- data.frame(tsAirgap, x1, tsAirgap, x2)
  expect_warning(na_seasplit(xyz), regexp = "na_seasplit: No imputation performed for column 2")
  expect_warning(na_seasplit(xyz), regexp = "No seasonality information for dataset could be found")
  expect_warning(na_seadec(xyz), regexp = "na_seadec: No imputation performed for column 2")
  expect_warning(na_seadec(xyz), regexp = "No seasonality information for dataset could be found")
  expect_warning(na_random(xyz), regexp = "na_random: No imputation performed for column 2")
  expect_warning(na_mean(xyz), regexp = "na_mean: No imputation performed for column 2")
  expect_warning(na_ma(xyz), regexp = "na_ma: No imputation performed for column 2")
  expect_warning(na_locf(xyz), regexp = "na_locf: No imputation performed for column 2")
  expect_warning(na_kalman(xyz), regexp = "na_kalman: No imputation performed for column 2")
  expect_warning(na_interpolation(xyz), regexp = "na_interpolation: No imputation performed for column 2")
})
================================================
FILE: tests/testthat/test-ggplot_na_distribution.R
================================================
context("ggplot_na_distribution")

# The old plotNA.distribution() entry point must raise an error.
test_that("Old functions give error", {
  expect_error(plotNA.distribution(tsAirgap))
})

# Calls ggplot_na_distribution() with defaults and with every styling
# parameter set, and checks that a ggplot object is returned.
test_that("Check that all parameters of plot run without error", {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    # The `else` is required: without it the block below is a separate
    # statement that runs even when ggplot2 is not installed.
    require("ggplot2")
    expect_true(is_ggplot(ggplot_na_distribution(tsAirgap)))
    expect_true(is_ggplot(ggplot_na_distribution(tsAirgap,
      color_points = "blue",
      color_lines = "gold",
      color_missing = "darkgreen",
      color_missing_border = "green",
      alpha_missing = 0.1,
      title = "test",
      subtitle = "test",
      xlab = "test",
      ylab = "test",
      shape_points = 15,
      size_points = 2,
      theme = ggplot2::theme_classic()
    )))
  }
})

# Multivariate, non-numeric and all-NA inputs must each raise an error.
test_that("Errors for wrong input", {
  ## input not univariate
  x <- data.frame(
    x = runif(10, 0, 10),
    y = runif(10, 0, 10)
  )
  expect_error(ggplot_na_distribution(x))

  ## input not numeric
  x <- c("a", 1, 2, 3)
  expect_error(ggplot_na_distribution(x))

  # input only NA
  all_na <- as.numeric(c(NA, NA, NA, NA, NA, NA, NA, NA))
  expect_error(ggplot_na_distribution(all_na))
})

# The tsNH4 dataset must produce a ggplot object.
test_that("Works with tsNH4", {
  expect_true(is_ggplot(
    ggplot_na_distribution(tsNH4)
  ))
})

# Yearly ts input combined with Date / POSIXct x_axis_labels.
test_that("Plot with x_axis_labels works and yearly data works", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call.
= FALSE
    )
  } else {
    require("zoo")
    require("ggplot2")

    # Yearly data: 60 values for 1912-1971 (frequency 1) with two NAs at
    # each end, built directly as a "ts" object.
    nh <- structure(c(
      NA, NA, 49.4, 51.1, 49.4, 47.9, 49.8, 50.9, 49.3, 51.9, 50.8, 49.6,
      49.3, 50.6, 48.4, 50.7, 50.9, 50.6, 51.5, 52.8, 51.8, 51.1, 49.8, 50.2,
      50.4, 51.6, 51.8, 50.9, 48.8, 51.7, 51, 50.6, 51.7, 51.5, 52.1, 51.3,
      51, 54, 51.4, 52.7, 53.1, 54.6, 52, 52, 50.9, 52.6, 50.2, 52.6, 51.6,
      51.9, 50.5, 50.9, 51.7, 51.4, 51.7, 50.8, 51.9, 51.8, NA, NA
    ), .Tsp = c(
      1912, 1971, 1
    ), class = "ts")

    # Use zoo to change ts time information to yearmon vector
    # Afterwards create Date vector and from this Date vector POSIXct
    nh_yearmon <- zoo::as.yearmon(time(nh))
    nh_date <- zoo::as.Date(nh_yearmon)
    nh_posix <- as.POSIXct(nh_date)

    # Default axis, Date labels, POSIXct labels, and a plot that is
    # further modified with an additional ggplot2 theme layer.
    expect_true(is_ggplot(
      ggplot_na_distribution(nh)
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution(nh, x_axis_labels = nh_date)
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution(nh, x_axis_labels = nh_posix)
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution(nh, x_axis_labels = nh_posix, title = "test")
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution(nh, x_axis_labels = nh_posix, title = "test") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
    ))
  }
})

# Same x_axis_labels checks, this time on the monthly tsAirgap series.
test_that("Plot with x_axis_labels works and tsAirgap data works", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call.
= FALSE
    )
  } else {
    require("zoo")
    require("ggplot2")

    # Use zoo to change ts time information to yearmon vector
    # Afterwards create Date vector and from this Date vector POSIXct
    airgap_yearmon <- zoo::as.yearmon(time(tsAirgap))
    airgap_date <- zoo::as.Date(airgap_yearmon)
    airgap_posix <- as.POSIXct(airgap_yearmon)

    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap)
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap, x_axis_labels = airgap_date)
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap, x_axis_labels = airgap_posix)
    ))

    # The returned plot must accept further ggplot2 layers.
    # ggtitle is namespace-qualified like every other ggplot2 call here.
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap, x_axis_labels = airgap_date) +
        ggplot2::scale_x_date(date_breaks = "6 month", date_labels = "%m-%Y") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1)) +
        ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::ggtitle("hjsdhs")
    ))
  }
})

# Inputs other than plain vectors / ts objects. Each optional package is
# checked exactly once (a second, unreachable zoo check was removed).
test_that("Non standard input - data.frame, tsibble, tibble, zoo", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tibble", quietly = TRUE)) {
    warning("Pkg tibble needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tsibble", quietly = TRUE)) {
    warning("Pkg tsibble needed for this test.",
      call.
= FALSE
    )
  } else {
    # Attach the optional packages so unqualified calls such as zoo()
    # and is_ggplot() resolve.
    require("zoo")
    require("ggplot2")
    require("tibble")
    require("tsibble")

    # data.frame
    tsAirgap_df <- data.frame(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap_df)
    ))

    # data.frame - multivariate - supposed to give Error
    tsAirgap_df2 <- data.frame(tsAirgap, tsAirgap)
    expect_error(
      ggplot_na_distribution(tsAirgap_df2)
    )

    # zoo and theme adjustment
    tsAirgap_zoo <- zoo::as.zoo(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap_zoo) +
        ggplot2::theme_minimal()
    ))

    # zoo - multivariate - supposed to give Error
    tsAirgap_zoo2 <- zoo(cbind(tsAirgap, tsAirgap), zoo::as.Date(zoo::as.yearmon(time(tsAirgap))))
    expect_error(
      ggplot_na_distribution(tsAirgap_zoo2)
    )

    # tsibble
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap_tsibble)
    ))

    # tsibble just value, theme adjustment
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap_tsibble$value) +
        ggplot2::theme_minimal()
    ))

    # tsibble multivariate - plots first non index variable (maybe error would be better)
    tsAirgap_tsibble2 <- tsibble::as_tsibble(tsAirgap)
    tsAirgap_tsibble2$var2 <- tsAirgap
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap_tsibble2)
    ))

    # tibble
    tsAirgap_tibble <- tibble::as_tibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap_tibble)
    ))

    # tibble multivariate - plots first variable (maybe error would be better)
    tsAirgap_tibble2 <- tibble::as_tibble(data.frame(tsAirgap, tsAirgap))
    expect_true(is_ggplot(
      ggplot_na_distribution(tsAirgap_tibble2)
    ))
  }
})

================================================
FILE: tests/testthat/test-ggplot_na_distribution2.R
================================================
context("ggplot_na_distribution2")

# The old plotNA.distributionBar() entry point must raise an error.
test_that("Old functions give error", {
  expect_error(plotNA.distributionBar(tsAirgap))
})

# Calling the function under its former name ggplot_na_intervals() must
# raise an error as well.
test_that("Old name of function gives error", {
  expect_error(ggplot_na_intervals(tsAirgap))
})

test_that("Check that all
parameters of plot run without error", {
  # Exercises each parameter of ggplot_na_distribution2() and checks
  # that a ggplot object is returned every time.
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    require("ggplot2")
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap)))
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap, number_intervals = 8)))
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap, interval_size = 25)))
    expect_true(is_ggplot(ggplot_na_distribution2(tsNH4, measure = "count")))
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap,
      color_missing = "blue",
      color_existing = "yellow"
    )))
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap,
      alpha_missing = 1,
      alpha_existing = 1
    )))
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap,
      title = "Test",
      subtitle = "test",
      ylab = "test",
      xlab = "test"
    )))
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap, color_border = "black")))
    expect_true(is_ggplot(ggplot_na_distribution2(tsAirgap, theme = ggplot2::theme_classic())))
  }
})

# Multivariate, non-numeric and all-NA inputs must each raise an error.
test_that("Errors for wrong input", {
  ## input not univariate
  x <- data.frame(
    x = runif(10, 0, 10),
    y = runif(10, 0, 10)
  )
  expect_error(ggplot_na_distribution2(x))

  ## input not numeric
  x <- c("a", 1, NA, 3)
  expect_error(ggplot_na_distribution2(x))

  ## input only NA
  all_na <- as.numeric(c(NA, NA, NA, NA, NA, NA, NA, NA))
  expect_error(ggplot_na_distribution2(all_na))
})

# Yearly ts input, with and without a title / extra theme layer.
test_that("Plot works with test ts", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call.
= FALSE
    )
  } else {
    require("ggplot2")

    # Yearly data: 60 values for 1912-1971 (frequency 1) with two NAs at
    # each end, built directly as a "ts" object.
    nh <- structure(c(
      NA, NA, 49.4, 51.1, 49.4, 47.9, 49.8, 50.9, 49.3, 51.9, 50.8, 49.6,
      49.3, 50.6, 48.4, 50.7, 50.9, 50.6, 51.5, 52.8, 51.8, 51.1, 49.8, 50.2,
      50.4, 51.6, 51.8, 50.9, 48.8, 51.7, 51, 50.6, 51.7, 51.5, 52.1, 51.3,
      51, 54, 51.4, 52.7, 53.1, 54.6, 52, 52, 50.9, 52.6, 50.2, 52.6, 51.6,
      51.9, 50.5, 50.9, 51.7, 51.4, 51.7, 50.8, 51.9, 51.8, NA, NA
    ), .Tsp = c(
      1912, 1971, 1
    ), class = "ts")

    expect_true(is_ggplot(
      ggplot_na_distribution2(nh)
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution2(nh, title = "test")
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution2(nh, title = "test") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
    ))
  }
})

# tsNH4 input, plain and with additional ggplot2 layers added afterwards.
test_that(" tsNH4 data works", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    require("ggplot2")

    expect_true(is_ggplot(
      ggplot_na_distribution2(tsNH4)
    ))

    expect_true(is_ggplot(
      ggplot_na_distribution2(tsNH4) +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1)) +
        ggplot2::ggtitle("hjsdhs")
    ))
  }
})

# Inputs other than plain vectors / ts objects. Each optional package is
# checked exactly once (a second, unreachable zoo check was removed).
test_that("Non standard input - data.frame, tsibble, tibble, zoo", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tibble", quietly = TRUE)) {
    warning("Pkg tibble needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tsibble", quietly = TRUE)) {
    warning("Pkg tsibble needed for this test.",
      call.
= FALSE
    )
  } else {
    # Attach the optional packages so unqualified calls such as zoo()
    # and is_ggplot() resolve.
    require("zoo")
    require("ggplot2")
    require("tibble")
    require("tsibble")

    # data.frame
    tsAirgap_df <- data.frame(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution2(tsAirgap_df)
    ))

    # data.frame - multivariate - supposed to give Error
    tsAirgap_df2 <- data.frame(tsAirgap, tsAirgap)
    expect_error(
      ggplot_na_distribution2(tsAirgap_df2)
    )

    # zoo and theme adjustment
    tsAirgap_zoo <- zoo::as.zoo(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution2(tsAirgap_zoo) +
        ggplot2::theme_minimal()
    ))

    # zoo - multivariate - supposed to give Error
    tsAirgap_zoo2 <- zoo(cbind(tsAirgap, tsAirgap), zoo::as.Date(zoo::as.yearmon(time(tsAirgap))))
    expect_error(
      ggplot_na_distribution2(tsAirgap_zoo2)
    )

    # tsibble
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution2(tsAirgap_tsibble)
    ))

    # tsibble just value, theme adjustment
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution2(tsAirgap_tsibble$value) +
        ggplot2::theme_minimal()
    ))

    # tsibble multivariate - plots first non index variable (maybe error would be better)
    tsAirgap_tsibble2 <- tsibble::as_tsibble(tsAirgap)
    tsAirgap_tsibble2$var2 <- tsAirgap
    expect_true(is_ggplot(
      ggplot_na_distribution2(tsAirgap_tsibble2)
    ))

    # tibble
    tsAirgap_tibble <- tibble::as_tibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_distribution2(tsAirgap_tibble)
    ))

    # tibble multivariate - plots first variable (maybe error would be better)
    tsAirgap_tibble2 <- tibble::as_tibble(data.frame(tsAirgap, tsAirgap))
    expect_true(is_ggplot(
      ggplot_na_distribution2(tsAirgap_tibble2)
    ))
  }
})

================================================
FILE: tests/testthat/test-ggplot_na_gapsize.R
================================================
context("ggplot_na_gapsize")

# The old plotNA.gapsize() entry point must raise an error.
test_that("Old functions give error", {
  expect_error(plotNA.gapsize(tsAirgap))
})

# Exercises each parameter of ggplot_na_gapsize() and checks that a
# ggplot object is returned every time.
test_that("Check that all parameters of plot run without error", {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg
ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    require("ggplot2")
    # Logical arguments are spelled FALSE rather than F: F is an ordinary
    # variable that can be reassigned, FALSE is a reserved constant.
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap)))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, ranked_by = "total")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsNH4, limit = 2)))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, legend = FALSE)))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, orientation = "horizontal")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, include_total = FALSE)))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, color_occurrence = "blue")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, limit = 1)))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, include_total = FALSE)))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, ranked_by = "total")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, color_occurrence = "gold")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, color_total = "green")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, title = "test")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, subtitle = "test2")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, xlab = "test")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, ylab = "test")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, orientation = "vertical")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, label_occurrence = "occ")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, label_total = "total")))
    expect_true(is_ggplot(ggplot_na_gapsize(tsAirgap, theme = ggplot2::theme_classic())))
  }
})

# Multivariate, non-numeric, NA-free and all-NA inputs must each raise
# an error.
test_that("Errors for wrong input", {
  ## input not univariate
  x <- data.frame(
    x = runif(10, 0, 10),
    y = runif(10, 0, 10)
  )
  expect_error(ggplot_na_gapsize(x))

  ## input not numeric
  x <- c("a", 1, NA, 3)
  expect_error(ggplot_na_gapsize(x))

  ## No NA values
  x <- 1:10
  expect_error(ggplot_na_gapsize(x))

  ## input only NA
  all_na <- as.numeric(c(NA, NA, NA, NA, NA, NA, NA, NA))
  expect_error(ggplot_na_gapsize(all_na))
})

# Yearly ts input, with and without a title / extra theme layer.
test_that("Plot works with test ts", {
  skip_on_cran()
  if (!requireNamespace("ggplot2",
quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    require("ggplot2")

    # Yearly data: 60 values for 1912-1971 (frequency 1) with two NAs at
    # each end, built directly as a "ts" object.
    nh <- structure(c(
      NA, NA, 49.4, 51.1, 49.4, 47.9, 49.8, 50.9, 49.3, 51.9, 50.8, 49.6,
      49.3, 50.6, 48.4, 50.7, 50.9, 50.6, 51.5, 52.8, 51.8, 51.1, 49.8, 50.2,
      50.4, 51.6, 51.8, 50.9, 48.8, 51.7, 51, 50.6, 51.7, 51.5, 52.1, 51.3,
      51, 54, 51.4, 52.7, 53.1, 54.6, 52, 52, 50.9, 52.6, 50.2, 52.6, 51.6,
      51.9, 50.5, 50.9, 51.7, 51.4, 51.7, 50.8, 51.9, 51.8, NA, NA
    ), .Tsp = c(
      1912, 1971, 1
    ), class = "ts")

    expect_true(is_ggplot(
      ggplot_na_gapsize(nh)
    ))

    expect_true(is_ggplot(
      ggplot_na_gapsize(nh, title = "test")
    ))

    expect_true(is_ggplot(
      ggplot_na_gapsize(nh, title = "test") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
    ))
  }
})

# tsAirgap input, plain and with additional ggplot2 layers added afterwards.
test_that(" tsAirgap data works", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    require("ggplot2")

    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap)
    ))

    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap) +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1)) +
        ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::ggtitle("hjsdhs")
    ))
  }
})

# Inputs other than plain vectors / ts objects. Each optional package is
# checked exactly once (a second, unreachable zoo check was removed).
test_that("Non standard input - data.frame, tsibble, tibble, zoo", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tibble", quietly = TRUE)) {
    warning("Pkg tibble needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tsibble", quietly = TRUE)) {
    warning("Pkg tsibble needed for this test.",
      call.
= FALSE
    )
  } else {
    # Attach the optional packages so unqualified calls such as zoo()
    # and is_ggplot() resolve.
    require("zoo")
    require("ggplot2")
    require("tibble")
    require("tsibble")

    # data.frame
    tsAirgap_df <- data.frame(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap_df)
    ))

    # data.frame - multivariate - supposed to give Error
    tsAirgap_df2 <- data.frame(tsAirgap, tsAirgap)
    expect_error(
      ggplot_na_gapsize(tsAirgap_df2)
    )

    # zoo and theme adjustment
    tsAirgap_zoo <- zoo::as.zoo(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap_zoo) +
        ggplot2::theme_minimal()
    ))

    # zoo - multivariate - supposed to give Error
    tsAirgap_zoo2 <- zoo(cbind(tsAirgap, tsAirgap), zoo::as.Date(zoo::as.yearmon(time(tsAirgap))))
    expect_error(
      ggplot_na_gapsize(tsAirgap_zoo2)
    )

    # tsibble
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap_tsibble)
    ))

    # tsibble just value, theme adjustment
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap_tsibble$value) +
        ggplot2::theme_minimal()
    ))

    # tsibble multivariate - plots first non index variable (maybe error would be better)
    tsAirgap_tsibble2 <- tsibble::as_tsibble(tsAirgap)
    tsAirgap_tsibble2$var2 <- tsAirgap
    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap_tsibble2)
    ))

    # tibble
    tsAirgap_tibble <- tibble::as_tibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap_tibble)
    ))

    # tibble multivariate - plots first variable (maybe error would be better)
    tsAirgap_tibble2 <- tibble::as_tibble(data.frame(tsAirgap, tsAirgap))
    expect_true(is_ggplot(
      ggplot_na_gapsize(tsAirgap_tibble2)
    ))
  }
})

================================================
FILE: tests/testthat/test-ggplot_na_gapsize2.R
================================================
context("ggplot_na_gapsize2")

# Exercises each parameter of ggplot_na_gapsize2() and checks that a
# ggplot object is returned every time.
test_that("Check that all parameters of plot run without error", {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call.
= FALSE
    )
  } else {
    require("ggplot2")
    # Defaults on both bundled datasets
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap)))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4)))
    # Bubble appearance
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, colors_bubbles = c("#FCFBFF", "#EFEEFA", "#DDDAEF", "#C8C3E2"))))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, color_border = "red")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, alpha_bubbles = 0.1)))
    # Every accepted labels_bubbles value
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, labels_bubbles = "gap-occurrence")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, labels_bubbles = "gap")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, labels_bubbles = "total")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, labels_bubbles = "none")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, labels_bubbles = "occurrence")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, size_bubbles = 50)))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, size_bubbles = 3)))
    # Filters
    # NOTE(review): min_gap / max_gap are presumably resolved by partial
    # argument matching (the error tests use max_gapsize) - confirm
    # against the function signature.
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, min_totals = 50)))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, min_occurrence = 10)))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, min_gap = 10)))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsNH4, max_gap = 200)))
    # Titles, legend and axis labels
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, title = "test")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, subtitle = "test2")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, legend_breaks = 4)))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, legend_breaks = 10)))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, legend_point_sizes = "default")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, legend_point_sizes = "actual")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, legend_point_sizes = c(1, 2, 3, 4))))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, legend_title = "Legend")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, legend_position = "left")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, ylab = "test")))
expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, xlab = "test")))
    expect_true(is_ggplot(ggplot_na_gapsize2(tsAirgap, theme = ggplot2::theme_classic())))
  }
})

# Invalid inputs, over-restrictive filters and invalid legend_point_sizes
# values must each raise an error.
test_that("Errors for wrong input", {
  ## input not univariate
  x <- data.frame(
    x = runif(10, 0, 10),
    y = runif(10, 0, 10)
  )
  expect_error(ggplot_na_gapsize2(x))

  ## input not numeric
  x <- c("a", 1, NA, 3)
  expect_error(ggplot_na_gapsize2(x))

  ## No NA values
  x <- 1:10
  expect_error(ggplot_na_gapsize2(x))

  ## input only NA
  all_na <- as.numeric(c(NA, NA, NA, NA, NA, NA, NA, NA))
  expect_error(ggplot_na_gapsize2(all_na))

  ## Empty plot because of too restrictive filters
  # NOTE(review): the max_gapsize = 0 assertion appears twice; the second
  # one was possibly meant to cover a different filter - confirm.
  expect_error(ggplot_na_gapsize2(tsAirgap, min_totals = 400))
  expect_error(ggplot_na_gapsize2(tsAirgap, max_gapsize = 0))
  expect_error(ggplot_na_gapsize2(tsAirgap, min_occurrence = 11))
  expect_error(ggplot_na_gapsize2(tsAirgap, max_gapsize = 0))

  ## Wrong legend input for custom legend_point_sizes
  expect_error(ggplot_na_gapsize2(tsAirgap, legend_point_sizes = c(1, 2)))

  ## Wrong legend input for custom legend_point_sizes
  expect_error(ggplot_na_gapsize2(tsAirgap, legend_point_sizes = "falsch"))
})

# Yearly ts input, with and without a title / extra theme layer.
test_that("Plot works with test ts", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call.
= FALSE
    )
  } else {
    require("ggplot2")

    # Yearly data: 60 values for 1912-1971 (frequency 1) with two NAs at
    # each end, built directly as a "ts" object.
    nh <- structure(c(
      NA, NA, 49.4, 51.1, 49.4, 47.9, 49.8, 50.9, 49.3, 51.9, 50.8, 49.6,
      49.3, 50.6, 48.4, 50.7, 50.9, 50.6, 51.5, 52.8, 51.8, 51.1, 49.8, 50.2,
      50.4, 51.6, 51.8, 50.9, 48.8, 51.7, 51, 50.6, 51.7, 51.5, 52.1, 51.3,
      51, 54, 51.4, 52.7, 53.1, 54.6, 52, 52, 50.9, 52.6, 50.2, 52.6, 51.6,
      51.9, 50.5, 50.9, 51.7, 51.4, 51.7, 50.8, 51.9, 51.8, NA, NA
    ), .Tsp = c(
      1912, 1971, 1
    ), class = "ts")

    expect_true(is_ggplot(
      ggplot_na_gapsize2(nh)
    ))

    expect_true(is_ggplot(
      ggplot_na_gapsize2(nh, title = "test")
    ))

    expect_true(is_ggplot(
      ggplot_na_gapsize2(nh, title = "test") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
    ))
  }
})

# tsAirgap input, plain and with additional ggplot2 layers added afterwards.
test_that(" tsAirgap data works", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    require("ggplot2")

    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap)
    ))

    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap) +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1)) +
        ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::theme(plot.subtitle = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::ggtitle("hjsdhs")
    ))
  }
})

# Inputs other than plain vectors / ts objects. Each optional package is
# checked exactly once (a second, unreachable zoo check was removed).
test_that("Non standard input - data.frame, tsibble, tibble, zoo", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tibble", quietly = TRUE)) {
    warning("Pkg tibble needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tsibble", quietly = TRUE)) {
    warning("Pkg tsibble needed for this test.",
      call.
= FALSE
    )
  } else {
    # Attach the optional packages so unqualified calls such as zoo()
    # and is_ggplot() resolve.
    require("zoo")
    require("ggplot2")
    require("tibble")
    require("tsibble")

    # data.frame
    tsAirgap_df <- data.frame(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap_df)
    ))

    # data.frame - multivariate - supposed to give Error
    tsAirgap_df2 <- data.frame(tsAirgap, tsAirgap)
    expect_error(
      ggplot_na_gapsize2(tsAirgap_df2)
    )

    # zoo and theme adjustment
    tsAirgap_zoo <- zoo::as.zoo(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap_zoo) +
        ggplot2::theme_minimal()
    ))

    # zoo - multivariate - supposed to give Error
    tsAirgap_zoo2 <- zoo(cbind(tsAirgap, tsAirgap), zoo::as.Date(zoo::as.yearmon(time(tsAirgap))))
    expect_error(
      ggplot_na_gapsize2(tsAirgap_zoo2)
    )

    # tsibble
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap_tsibble)
    ))

    # tsibble just value, theme adjustment
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap_tsibble$value) +
        ggplot2::theme_minimal()
    ))

    # tsibble multivariate - plots first non index variable (maybe error would be better)
    tsAirgap_tsibble2 <- tsibble::as_tsibble(tsAirgap)
    tsAirgap_tsibble2$var2 <- tsAirgap
    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap_tsibble2)
    ))

    # tibble
    tsAirgap_tibble <- tibble::as_tibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap_tibble)
    ))

    # tibble multivariate - plots first variable (maybe error would be better)
    tsAirgap_tibble2 <- tibble::as_tibble(data.frame(tsAirgap, tsAirgap))
    expect_true(is_ggplot(
      ggplot_na_gapsize2(tsAirgap_tibble2)
    ))
  }
})

================================================
FILE: tests/testthat/test-ggplot_na_imputations.R
================================================
context("ggplot_na_imputations")

# The old plotNA.imputations() entry point must raise an error.
test_that("Old functions give error", {
  imp_mean <- na_mean(tsAirgap)
  expect_error(plotNA.imputations(tsAirgap, imp_mean))
})

# Exercises each parameter of ggplot_na_imputations() and checks that a
# ggplot object is returned every time.
test_that("Check that all parameters of plot run without error", {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else {
    # The `else` is required: without it the block below is a separate
    # statement that runs even when ggplot2 is not installed.
    require("ggplot2")

    # Mean-imputed series used as x_with_imputations throughout
    imp_mean <- na_mean(tsAirgap)

    # Positional arguments, without and with ground truth
    expect_true(is_ggplot(ggplot_na_imputations(tsAirgap, imp_mean)))
    expect_true(is_ggplot(ggplot_na_imputations(tsAirgap, imp_mean, tsAirgapComplete)))

    # Named arguments
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean
    )))

    # Titles and axis labels
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean,
      x_with_truth = tsAirgapComplete,
      title = "test",
      subtitle = "test",
      xlab = "test",
      ylab = "test"
    )))

    # Colors
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean,
      x_with_truth = tsAirgapComplete,
      color_points = "gold",
      color_imputations = "blue",
      color_truth = "yellow"
    )))

    # Point shapes
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean,
      x_with_truth = tsAirgapComplete,
      shape_points = 15,
      shape_imputations = 15,
      shape_truth = 15
    )))

    # Point sizes
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean,
      x_with_truth = tsAirgapComplete,
      size_points = 2,
      size_imputations = 2.2,
      size_truth = 2
    )))

    # Line width and type
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean,
      x_with_truth = tsAirgapComplete,
      width_lines = 0.6,
      linetype = "dotted"
    )))

    # Switches
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean,
      x_with_truth = tsAirgapComplete,
      connect_na = FALSE,
      legend = FALSE
    )))

    # Legend labels and theme
    expect_true(is_ggplot(ggplot_na_imputations(
      x_with_na = tsAirgap,
      x_with_imputations = imp_mean,
      x_with_truth = tsAirgapComplete,
      legend_size = 6,
      label_known = "known",
      label_imputations = "imputed",
      label_truth = "truth",
      theme = ggplot2::theme_classic()
    )))
  }
})

# An invalid input must raise an error regardless of which of the three
# data arguments receives it.
test_that("Errors for wrong input", {
  ## input not univariate
  x <- data.frame(
    x = runif(10, 0, 10),
    y = runif(10, 0, 10)
  )
  expect_error(ggplot_na_imputations(x, tsAirgapComplete))
expect_error(ggplot_na_imputations(tsAirgapComplete, x))
  expect_error(ggplot_na_imputations(tsAirgap, tsAirgapComplete, x))

  ## input not numeric
  x <- c("a", 1, 2, 3)
  expect_error(ggplot_na_imputations(x, tsAirgapComplete))
  expect_error(ggplot_na_imputations(tsAirgapComplete, x))
  expect_error(ggplot_na_imputations(tsAirgap, tsAirgapComplete, x))

  # input only NA
  all_na <- as.numeric(rep(NA, 144))
  expect_error(ggplot_na_imputations(all_na, tsAirgapComplete))
  expect_error(ggplot_na_imputations(tsAirgapComplete, all_na))

  # Input datasets do not belong together / length of input data different
  # NOTE(review): no assertion follows this heading - the case seems to be
  # untested here.

  ## No NA values for x_with_na
  x <- 1:144
  expect_error(ggplot_na_imputations(x, tsAirgapComplete))
})

# tsNH4 with its mean imputation and the complete series as ground truth.
test_that("Works with tsNH4", {
  expect_true(is_ggplot(
    ggplot_na_imputations(tsNH4, na_mean(tsNH4), tsNH4Complete)
  ))
})

# Yearly ts input combined with Date / POSIXct x_axis_labels.
test_that("Plot with x_axis_labels works and yearly data works", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call.
= FALSE
    )
  } else {
    require("zoo")
    require("ggplot2")

    # Yearly data: 60 values for 1912-1971 (frequency 1) with two NAs at
    # each end, built directly as a "ts" object.
    nh <- structure(c(
      NA, NA, 49.4, 51.1, 49.4, 47.9, 49.8, 50.9, 49.3, 51.9, 50.8, 49.6,
      49.3, 50.6, 48.4, 50.7, 50.9, 50.6, 51.5, 52.8, 51.8, 51.1, 49.8, 50.2,
      50.4, 51.6, 51.8, 50.9, 48.8, 51.7, 51, 50.6, 51.7, 51.5, 52.1, 51.3,
      51, 54, 51.4, 52.7, 53.1, 54.6, 52, 52, 50.9, 52.6, 50.2, 52.6, 51.6,
      51.9, 50.5, 50.9, 51.7, 51.4, 51.7, 50.8, 51.9, 51.8, NA, NA
    ), .Tsp = c(
      1912, 1971, 1
    ), class = "ts")

    # Use zoo to change ts time information to yearmon vector
    # Afterwards create Date vector and from this Date vector POSIXct
    nh_yearmon <- zoo::as.yearmon(time(nh))
    nh_date <- zoo::as.Date(nh_yearmon)
    nh_posix <- as.POSIXct(nh_date)

    expect_true(is_ggplot(
      ggplot_na_imputations(nh, na_mean(nh))
    ))

    expect_true(is_ggplot(
      ggplot_na_imputations(nh, na_mean(nh), x_axis_labels = nh_date)
    ))

    expect_true(is_ggplot(
      ggplot_na_imputations(nh, na_mean(nh), x_axis_labels = nh_posix)
    ))

    expect_true(is_ggplot(
      ggplot_na_imputations(nh, na_mean(nh), x_axis_labels = nh_posix, title = "test")
    ))

    expect_true(is_ggplot(
      ggplot_na_imputations(nh, na_mean(nh), x_axis_labels = nh_posix, title = "test") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60, hjust = 1))
    ))
  }
})

# Inputs other than plain vectors / ts objects. Each optional package is
# checked exactly once (a second, unreachable zoo check was removed).
test_that("Non standard input - data.frame, tsibble, tibble, zoo", {
  skip_on_cran()
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    warning("Pkg ggplot2 needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("zoo", quietly = TRUE)) {
    warning("Pkg zoo needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tibble", quietly = TRUE)) {
    warning("Pkg tibble needed for this test.",
      call. = FALSE
    )
  } else if (!requireNamespace("tsibble", quietly = TRUE)) {
    warning("Pkg tsibble needed for this test.",
      call.
= FALSE
    )
  } else {
    # Attach the optional packages so unqualified calls such as zoo()
    # and is_ggplot() resolve.
    require("zoo")
    require("ggplot2")
    require("tibble")
    require("tsibble")

    # In every case below the second argument is the na_mean() imputation
    # of the very same object that is passed as first argument.

    # data.frame
    tsAirgap_df <- data.frame(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_imputations(tsAirgap_df, na_mean(tsAirgap_df))
    ))

    # data.frame - multivariate - supposed to give Error
    tsAirgap_df2 <- data.frame(tsAirgap, tsAirgap)
    expect_error(
      ggplot_na_imputations(tsAirgap_df2, na_mean(tsAirgap_df2))
    )

    # zoo and theme adjustment
    tsAirgap_zoo <- zoo::as.zoo(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_imputations(tsAirgap_zoo, na_mean(tsAirgap_zoo)) +
        ggplot2::theme_minimal()
    ))

    # zoo - multivariate - supposed to give Error
    tsAirgap_zoo2 <- zoo(cbind(tsAirgap, tsAirgap), zoo::as.Date(zoo::as.yearmon(time(tsAirgap))))
    expect_error(
      ggplot_na_imputations(tsAirgap_zoo2, na_mean(tsAirgap_zoo2))
    )

    # tsibble
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_imputations(tsAirgap_tsibble, na_mean(tsAirgap_tsibble))
    ))

    # tsibble just value, theme adjustment
    tsAirgap_tsibble <- tsibble::as_tsibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_imputations(tsAirgap_tsibble$value, na_mean(tsAirgap_tsibble$value)) +
        ggplot2::theme_minimal()
    ))

    # tsibble multivariate - plots first non index variable (maybe error would be better)
    tsAirgap_tsibble2 <- tsibble::as_tsibble(tsAirgap)
    tsAirgap_tsibble2$var2 <- tsAirgap
    expect_true(is_ggplot(
      ggplot_na_imputations(tsAirgap_tsibble2, na_mean(tsAirgap_tsibble2))
    ))

    # tibble
    tsAirgap_tibble <- tibble::as_tibble(tsAirgap)
    expect_true(is_ggplot(
      ggplot_na_imputations(tsAirgap_tibble, na_mean(tsAirgap_tibble))
    ))

    # tibble multivariate - plots first variable (maybe error would be better)
    tsAirgap_tibble2 <- tibble::as_tibble(data.frame(tsAirgap, tsAirgap))
    expect_true(is_ggplot(
      ggplot_na_imputations(tsAirgap_tibble2, na_mean(tsAirgap_tibble2))
    ))
  }
})

================================================
FILE: tests/testthat/test-input-na_advanced-tsObjects.R
================================================
context("Advanced Time
Series Objects Input") test_that("tsibble objects", { skip_on_cran() if (!requireNamespace("tsibble", quietly = TRUE)) { warning("Pkg tsibble needed for this test.", call. = FALSE ) } else { require("tsibble") x <- as_tsibble(tsAirgap) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) } }) test_that("Multivariate tsibble ", { skip_on_cran() if (!requireNamespace("tsibble", quietly = TRUE)) { warning("Pkg tsibble needed for this test.", call. = FALSE ) } else { require("tsibble") x <- data.frame(tsAirgap, tsAirgap) x <- as.ts(x) x <- as_tsibble(x) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) } }) test_that("tibble objects", { skip_on_cran() if (!requireNamespace("tibble", quietly = TRUE)) { warning("Pkg tibble needed for this test.", call. = FALSE ) } else { require("tibble") x <- as_tibble(tsAirgap) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) } }) test_that("multivariate tibble objects", { skip_on_cran() if (!requireNamespace("tibble", quietly = TRUE)) { warning("Pkg tibble needed for this test.", call. 
= FALSE ) } else { require("tibble") x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) z <- as_tibble(x) expect_false(anyNA(na_mean(z))) expect_false(anyNA(na_kalman(z))) expect_false(anyNA(na_interpolation(z))) expect_false(anyNA(na_locf(z))) expect_false(anyNA(na_ma(z))) expect_false(anyNA(na_random(z))) expect_false(anyNA(na_seadec(z))) expect_false(anyNA(na_seasplit(z))) expect_false(anyNA(na_replace(z))) } }) test_that("multivariate tibble objects - non-numeric input", { skip_on_cran() if (!requireNamespace("tibble", quietly = TRUE)) { warning("Pkg tibble needed for this test.", call. = FALSE ) } else { require("tibble") x <- data.frame(rep("x", 144), tsAirgap, tsAirgap, rep("qq", 144), tsAirgapComplete) z <- as_tibble(x) expect_false(anyNA(na_mean(z))) expect_false(anyNA(na_kalman(z))) expect_false(anyNA(na_interpolation(z))) expect_false(anyNA(na_locf(z))) expect_false(anyNA(na_ma(z))) expect_false(anyNA(na_random(z))) expect_false(anyNA(na_seadec(z))) expect_false(anyNA(na_seasplit(z))) expect_false(anyNA(na_replace(z))) } }) test_that("zoo objects", { skip_on_cran() if (!requireNamespace("zoo", quietly = TRUE)) { warning("Pkg zoo needed for this test.", call. = FALSE ) } else { require("zoo") x <- as.zoo(tsAirgap) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) } }) test_that("multivariate zoo objects", { skip_on_cran() if (!requireNamespace("zoo", quietly = TRUE)) { warning("Pkg zoo needed for this test.", call. 
= FALSE
    )
  } else {
    require("zoo")
    require("stats")

    # Monthly Date index with one entry per tsAirgap observation; the start
    # date is derived from stats::start(tsAirgap).
    time <- base::seq(
      from = zoo::as.Date(zoo::as.yearmon(stats::start(tsAirgap)))[1],
      by = "month",
      length.out = length(tsAirgap)
    )

    # Three value columns: two copies of the series with NAs and the complete series.
    x <- data.frame(time, zoo::coredata(tsAirgap), zoo::coredata(tsAirgap), zoo::coredata(tsAirgapComplete))
    z <- zoo::read.zoo(x, format = "%Y-%m-%d")

    # Every imputation function must return the zoo object without any NA left.
    expect_false(anyNA(na_mean(z)))
    expect_false(anyNA(na_kalman(z)))
    expect_false(anyNA(na_interpolation(z)))
    expect_false(anyNA(na_locf(z)))
    expect_false(anyNA(na_ma(z)))
    expect_false(anyNA(na_random(z)))
    expect_false(anyNA(na_seadec(z)))
    expect_false(anyNA(na_seasplit(z)))
    expect_false(anyNA(na_replace(z)))
  }
})

# xts input: each imputation function is applied to as.xts(tsAirgap) and the
# result must not contain NAs. Only a warning is raised if xts is not installed.
test_that("xts objects", {
  skip_on_cran()
  if (!requireNamespace("xts", quietly = TRUE)) {
    warning("Pkg xts needed for this test.",
      call. = FALSE
    )
  } else {
    require("xts")
    x <- as.xts(tsAirgap)
    expect_false(anyNA(na_mean(x)))
    expect_false(anyNA(na_kalman(x)))
    expect_false(anyNA(na_interpolation(x)))
    expect_false(anyNA(na_locf(x)))
    expect_false(anyNA(na_ma(x)))
    expect_false(anyNA(na_random(x)))
    expect_false(anyNA(na_seadec(x)))
    expect_false(anyNA(na_seasplit(x)))
    expect_false(anyNA(na_replace(x)))
  }
})

# timeSeries input: same checks on as.timeSeries(tsAirgap).
test_that("timeSeries objects", {
  skip_on_cran()
  if (!requireNamespace("timeSeries", quietly = TRUE)) {
    warning("Pkg timeSeries needed for this test.",
      call. = FALSE
    )
  } else {
    require("timeSeries")
    x <- as.timeSeries(tsAirgap)
    expect_false(anyNA(na_mean(x)))
    expect_false(anyNA(na_kalman(x)))
    expect_false(anyNA(na_interpolation(x)))
    expect_false(anyNA(na_locf(x)))
    expect_false(anyNA(na_ma(x)))
    expect_false(anyNA(na_random(x)))
    expect_false(anyNA(na_seadec(x)))
    expect_false(anyNA(na_seasplit(x)))
    expect_false(anyNA(na_replace(x)))
  }
})

# tis input. The guard below checks for the 'tis' package, so the warning
# names 'tis' as the missing package (it previously named 'timeSeries').
test_that("tis objects", {
  skip_on_cran()
  if (!requireNamespace("tis", quietly = TRUE)) {
    warning("Pkg tis needed for this test.",
      call.
= FALSE ) } else { require("tis") x <- as.tis(tsAirgap) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) # expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) } }) test_that("Imputation works for mts", { skip_on_cran() # Checking if NAs remain in mts x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) x <- ts(x) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) }) test_that("Imputation works for data.frame", { skip_on_cran() # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) }) test_that("Imputation works for data.frame univariate", { skip_on_cran() # Checking if NAs remain in data.frame x <- data.frame(tsAirgap) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) }) test_that("Imputation works for matrix", { skip_on_cran() # Checking if NAs remain in matrix x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) x <- as.matrix(x) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x)))
expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) }) test_that("Imputation works for matrix univariate", { skip_on_cran() # Checking if NAs remain in matrix x <- data.frame(tsAirgap) x <- as.matrix(x) expect_false(anyNA(na_mean(x))) expect_false(anyNA(na_kalman(x))) expect_false(anyNA(na_interpolation(x))) expect_false(anyNA(na_locf(x))) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_random(x))) expect_false(anyNA(na_seadec(x))) expect_false(anyNA(na_seasplit(x))) expect_false(anyNA(na_replace(x))) }) ================================================ FILE: tests/testthat/test-na_interpolation.R ================================================ context("na_interpolation") test_that("All NA vector throws error", { expect_error(na_interpolation(c(NA, NA, NA, NA, NA))) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_interpolation(x, option = "linear", rule = 1), na.rm = T), digits = 1), 264.9) expect_equal(round(mean(na_interpolation(x, option = "linear")), digits = 1), 273.6) expect_equal(round(mean(na_interpolation(x, option = "spline")), digits = 1), 276.2) expect_equal(round(mean(na_interpolation(x, option = "stine")), digits = 1), 273.4) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap
x[1:5] <- NA expect_equal(round(mean(na_interpolation(x, option = "linear", rule = 1), na.rm = T), digits = 1), 286.4) expect_equal(round(mean(na_interpolation(x, option = "linear")), digits = 1), 281.1) expect_equal(round(mean(na_interpolation(x, option = "spline")), digits = 1), 283.0) expect_equal(round(mean(na_interpolation(x, option = "stine")), digits = 1), 280.8) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_interpolation(x, option = "linear", rule = 1), na.rm = T), digits = 1), 280.7) expect_equal(round(mean(na_interpolation(x, option = "linear")), digits = 1), 280.7) expect_equal(round(mean(na_interpolation(x, option = "spline")), digits = 1), 280.1) expect_equal(round(mean(na_interpolation(x, option = "stine")), digits = 1), 280.5) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_interpolation(x, option = "linear", rule = 1))) expect_false(anyNA(na_interpolation(x, option = "linear"))) expect_false(anyNA(na_interpolation(x, option = "spline"))) expect_false(anyNA(na_interpolation(x, option = "stine"))) }) test_that("Warning for wrong input for option parameter", { expect_error(na_interpolation(tsAirgap, option = "wrongOption")) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_equal(length(na.omit(na_interpolation(x, option = "linear", rule = 1))), 142) expect_false(anyNA(na_interpolation(x, option = "linear"))) expect_false(anyNA(na_interpolation(x, option = "spline"))) expect_false(anyNA(na_interpolation(x, option = "stine"))) expect_false(anyNA(na_interpolation(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA 
expect_equal(length(na.omit(na_interpolation(x, option = "linear", rule = 1))), 142) expect_false(anyNA(na_interpolation(x, option = "linear"))) expect_false(anyNA(na_interpolation(x, option = "spline"))) expect_false(anyNA(na_interpolation(x, option = "stine"))) expect_false(anyNA(na_interpolation(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_equal(length(na.omit(na_interpolation(x, option = "linear", rule = 1))), 144) expect_false(anyNA(na_interpolation(x, option = "linear"))) expect_false(anyNA(na_interpolation(x, option = "spline"))) expect_false(anyNA(na_interpolation(x, option = "stine"))) expect_false(anyNA(na_interpolation(x))) }) test_that("Over 90% NAs", { x <- tsAirgap x[10:140] <- NA expect_equal(length(na.omit(na_interpolation(x, option = "linear", rule = 1))), 144) expect_false(anyNA(na_interpolation(x, option = "linear"))) expect_false(anyNA(na_interpolation(x, option = "spline"))) expect_false(anyNA(na_interpolation(x, option = "stine"))) expect_false(anyNA(na_interpolation(x))) }) ================================================ FILE: tests/testthat/test-na_kalman.R ================================================ context("na_kalman") test_that("All NA vector throws error", { expect_error(na_kalman(c(NA, NA, NA, NA, NA))) }) test_that("Workaround solution for constant values for StructTS works", { x <- c(4,4,4,4,NA,4,4,4) expect_equal(round(mean(na_kalman(x, model = "StructTS", smooth = T)), digits = 1), 4) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_kalman(x, model = "auto.arima", smooth = T)), digits = 1), 280.3) expect_equal(round(mean(na_kalman(x, model = "auto.arima", smooth = F)), digits = 1), 
279.2) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_kalman(x, model = "auto.arima", smooth = T)), digits = 1), 284.8) expect_equal(round(mean(na_kalman(x, model = "auto.arima", smooth = F)), digits = 1), 291.6) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_kalman(x, model = "auto.arima", smooth = T)), digits = 1), 280.2) expect_equal(round(mean(na_kalman(x, model = "auto.arima", smooth = F)), digits = 1), 279.8) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = T))) expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = F))) }) test_that("Warning for wrong input for model parameter", { expect_error(na_kalman(tsAirgap, model = "wrongModel")) }) test_that("Warning for wrong input for smooth parameter", { expect_error(na_kalman(tsAirgap, smooth = "Wrong")) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = T))) expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = F))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = T))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = F))) expect_false(anyNA(na_kalman(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA expect_false(anyNA(na_kalman(x, model = 
"auto.arima", smooth = T))) expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = F))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = T))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = F))) expect_false(anyNA(na_kalman(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = T))) expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = F))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = T))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = F))) expect_false(anyNA(na_kalman(x))) }) test_that("Over 50% NAs", { x <- tsAirgap x[30:100] <- NA expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = T))) expect_false(anyNA(na_kalman(x, model = "auto.arima", smooth = F))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = T))) expect_false(anyNA(na_kalman(x, model = "StructTS", smooth = F))) expect_false(anyNA(na_kalman(x))) }) ================================================ FILE: tests/testthat/test-na_locf.R ================================================ context("na_locf") test_that("All NA vector throws error", { expect_error(na_locf(c(NA, NA, NA, NA, NA))) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "mean")), digits = 1), 271.9) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "mean")), digits = 1), 266.7) expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "rev")), digits = 1), 271.9) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "rev")), digits = 1), 275.3) expect_equal(round(mean(na_locf(x, 
option = "locf", na_remaining = "rm")), digits = 1), 271.9) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "rm")), digits = 1), 266.7) expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "keep"), na.rm = T), digits = 1), 271.9) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "keep"), na.rm = T), digits = 1), 266.7) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "mean")), digits = 1), 284.3) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "mean")), digits = 1), 283.0) expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "rev")), digits = 1), 279.2) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "rev")), digits = 1), 283.0) expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "rm")), digits = 1), 284.3) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "rm")), digits = 1), 283.0) expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "keep"), na.rm = T), digits = 1), 284.3) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "keep"), na.rm = T), digits = 1), 283.0) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "mean")), digits = 1), 278.8) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "mean")), digits = 1), 282.7) 
expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "rev")), digits = 1), 278.8) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "rev")), digits = 1), 282.7) expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "rm")), digits = 1), 278.8) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "rm")), digits = 1), 282.7) expect_equal(round(mean(na_locf(x, option = "locf", na_remaining = "keep"), na.rm = T), digits = 1), 278.8) expect_equal(round(mean(na_locf(x, option = "nocb", na_remaining = "keep"), na.rm = T), digits = 1), 282.7) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "rev"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "rev"))) expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "rm"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "rm"))) }) test_that("Warning for wrong input for option parameter", { expect_error(na_locf(tsAirgap, option = "wrongOption")) }) test_that("Warning for wrong input for na_remaining parameter", { x <- tsAirgap x[1:2] <- NA expect_error(na_locf(x, na_remaining = "Wrong")) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "rev"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "rev"))) expect_equal(length(na_locf(x, option = "nocb", na_remaining = "rm")), 144) expect_equal(length(na_locf(x, option = "locf", na_remaining = "rm")), 142) expect_equal(length(na_locf(x, option = "nocb", na_remaining = 
"keep")), 144) expect_false(anyNA(na_locf(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "rev"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "rev"))) expect_equal(length(na_locf(x, option = "nocb", na_remaining = "rm")), 142) expect_equal(length(na_locf(x, option = "locf", na_remaining = "rm")), 144) expect_false(anyNA(na_locf(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "rev"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "rev"))) expect_equal(length(na_locf(x, option = "nocb", na_remaining = "rm")), 144) expect_equal(length(na_locf(x, option = "locf", na_remaining = "rm")), 144) expect_false(anyNA(na_locf(x))) }) test_that("Over 90% NAs", { x <- tsAirgap x[10:140] <- NA expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "locf", na_remaining = "rev"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "mean"))) expect_false(anyNA(na_locf(x, option = "nocb", na_remaining = "rev"))) expect_false(anyNA(na_locf(x))) }) ================================================ FILE: tests/testthat/test-na_ma.R ================================================ context("na_ma") test_that("All NA vector throws error", { expect_error(na_ma(c(NA, NA, NA, NA, NA))) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version 
in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_ma(x, weighting = "simple", k = 4)), digits = 1), 275.2) expect_equal(round(mean(na_ma(x, weighting = "simple", k = 20)), digits = 1), 276.1) expect_equal(round(mean(na_ma(x, weighting = "linear", k = 4)), digits = 1), 275.0) expect_equal(round(mean(na_ma(x, weighting = "linear", k = 20)), digits = 1), 276.1) expect_equal(round(mean(na_ma(x, weighting = "exponential", k = 4)), digits = 1), 274.6) expect_equal(round(mean(na_ma(x, weighting = "exponential", k = 20)), digits = 1), 274.7) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_ma(x, weighting = "simple", k = 4)), digits = 1), 282.1) expect_equal(round(mean(na_ma(x, weighting = "simple", k = 20)), digits = 1), 281.1) expect_equal(round(mean(na_ma(x, weighting = "linear", k = 4)), digits = 1), 281.9) expect_equal(round(mean(na_ma(x, weighting = "linear", k = 20)), digits = 1), 281.3) expect_equal(round(mean(na_ma(x, weighting = "exponential", k = 4)), digits = 1), 281.7) expect_equal(round(mean(na_ma(x, weighting = "exponential", k = 20)), digits = 1), 281.6) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_ma(x, weighting = "simple", k = 4)), digits = 1), 281.5) expect_equal(round(mean(na_ma(x, weighting = "simple", k = 20)), digits = 1), 280.7) expect_equal(round(mean(na_ma(x, weighting = "linear", k = 4)), digits = 1), 281.3) 
expect_equal(round(mean(na_ma(x, weighting = "linear", k = 20)), digits = 1), 280.9) expect_equal(round(mean(na_ma(x, weighting = "exponential", k = 4)), digits = 1), 281.2) expect_equal(round(mean(na_ma(x, weighting = "exponential", k = 20)), digits = 1), 281.1) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_ma(x))) expect_false(anyNA(na_ma(x, weighting = "simple", k = 4))) expect_false(anyNA(na_ma(x, weighting = "simple", k = 20))) expect_false(anyNA(na_ma(x, weighting = "linear", k = 4))) expect_false(anyNA(na_ma(x, weighting = "linear", k = 20))) expect_false(anyNA(na_ma(x, weighting = "exponential", k = 4))) expect_false(anyNA(na_ma(x, weighting = "exponential", k = 20))) }) test_that("Warning for wrong input for k parameter", { expect_error(na_ma(tsAirgap, k = -1)) }) test_that("Warning for wrong input for weighting parameter", { expect_error(na_ma(tsAirgap, weighting = "Wrong")) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_false(anyNA(na_ma(x, k = 4, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "exponential"))) expect_false(anyNA(na_ma(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA expect_false(anyNA(na_ma(x, k = 4, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 1, 
weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "exponential"))) expect_false(anyNA(na_ma(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_ma(x, k = 4, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "exponential"))) expect_false(anyNA(na_ma(x))) }) test_that("Over 90% NAs", { x <- tsAirgap x[10:140] <- NA expect_false(anyNA(na_ma(x, k = 4, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "simple"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "linear"))) expect_false(anyNA(na_ma(x, k = 4, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 1, weighting = "exponential"))) expect_false(anyNA(na_ma(x, k = 20, weighting = "exponential"))) expect_false(anyNA(na_ma(x))) }) ================================================ FILE: tests/testthat/test-na_mean.R ================================================ context("na_mean") test_that("All NA vector throws error", { expect_error(na_mean(c(NA, NA, NA, NA, NA))) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to 
check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_mean(x, option = "median")), digits = 1), 260.0) expect_equal(round(mean(na_mean(x, option = "mean")), digits = 1), 264.1) expect_equal(round(mean(na_mean(x, option = "mode")), digits = 1), 258.8) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_mean(x, option = "median")), digits = 1), 282.7) expect_equal(round(mean(na_mean(x, option = "mean")), digits = 1), 284.8) expect_equal(round(mean(na_mean(x, option = "mode")), digits = 1), 278.2) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_mean(x, option = "median")), digits = 1), 277.9) expect_equal(round(mean(na_mean(x, option = "mean")), digits = 1), 279.8) expect_equal(round(mean(na_mean(x, option = "mode")), digits = 1), 275.2) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_mean(x, option = "mean"))) expect_false(anyNA(na_mean(x, option = "mode"))) expect_false(anyNA(na_mean(x, option = "median"))) }) test_that("Warning for wrong input for option parameter", { expect_error(na_mean(tsAirgap, option = "Wrong")) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:4] <- NA expect_false(anyNA(na_mean(x, option = "mean"))) 
expect_false(anyNA(na_mean(x, option = "mode"))) expect_false(anyNA(na_mean(x, option = "median"))) expect_false(anyNA(na_mean(x))) }) test_that("Test NA at end", { x <- tsAirgap x[140:144] <- NA expect_false(anyNA(na_mean(x, option = "mean"))) expect_false(anyNA(na_mean(x, option = "mode"))) expect_false(anyNA(na_mean(x, option = "median"))) expect_false(anyNA(na_mean(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_mean(x, option = "mean"))) expect_false(anyNA(na_mean(x, option = "mode"))) expect_false(anyNA(na_mean(x, option = "median"))) expect_false(anyNA(na_mean(x))) }) test_that("Over 90% NAs", { x <- tsAirgap x[10:140] <- NA expect_false(anyNA(na_mean(x, option = "mean"))) expect_false(anyNA(na_mean(x, option = "mode"))) expect_false(anyNA(na_mean(x, option = "median"))) expect_false(anyNA(na_mean(x))) }) ================================================ FILE: tests/testthat/test-na_random.R ================================================ context("na_random") test_that("All NA vector throws error", { expect_error(na_random(c(NA, NA, NA, NA, NA))) }) test_that("Wrong input", { x <- data.frame(tsAirgap) expect_error(na_random(x, lower_bound = 1, upper_bound = -1)) x <- rep("string", 144) x[3] <- NA expect_error(na_random(x)) x <- rep(NA, 144) expect_error(na_random(x)) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_random(x))) }) test_that("Error for lower_bound > upper_bound", { expect_error(na_random(tsAirgap, lower_bound = 300, upper_bound = 100)) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_false(anyNA(na_random(x, lower_bound = 100, upper_bound = 200))) expect_false(anyNA(na_random(x, lower_bound = -10, upper_bound = 5000))) expect_false(anyNA(na_random(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA expect_false(anyNA(na_random(x, lower_bound 
= 100, upper_bound = 200))) expect_false(anyNA(na_random(x, lower_bound = -10, upper_bound = 5000))) expect_false(anyNA(na_random(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_random(x, lower_bound = 100, upper_bound = 200))) expect_false(anyNA(na_random(x, lower_bound = -10, upper_bound = 5000))) expect_false(anyNA(na_random(x))) }) test_that("Over 90% NAs", { x <- tsAirgap x[10:140] <- NA expect_false(anyNA(na_random(x, lower_bound = 100, upper_bound = 200))) expect_false(anyNA(na_random(x, lower_bound = -10, upper_bound = 5000))) expect_false(anyNA(na_random(x))) }) ================================================ FILE: tests/testthat/test-na_remove.R ================================================ context("na_remove") test_that("All NA vector throws error", { expect_error(na_remove(c(NA, NA, NA, NA, NA))) }) test_that("Wrong input", { x <- data.frame(tsAirgap, tsAirgap) expect_error(na_remove(x)) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_remove(x)), digits = 1), 264.1) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_remove(x)), digits = 1), 284.8) }) test_that("Correct results for all options with the tsAirgap dataset", { # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- 
tsAirgap expect_equal(round(mean(na_remove(x)), digits = 1), 279.8) }) test_that("Test NA at beginning", { x <- tsAirgapComplete x[1:2] <- NA expect_equal(length(na_remove(x)), 142) }) test_that("Test NA at end", { x <- tsAirgapComplete x[143:144] <- NA expect_equal(length(na_remove(x)), 142) }) test_that("Multiple NAs in a row", { x <- tsAirgapComplete x[40:80] <- NA expect_equal(length(na_remove(x)), 103) }) test_that("Over 90% NAs", { x <- tsAirgapComplete x[10:140] <- NA expect_equal(length(na_remove(x)), 13) }) ================================================ FILE: tests/testthat/test-na_replace.R ================================================ context("na_replace") test_that("All NA vector throws no error", { expect_equal(sum(na_replace(c(NA, NA, NA, NA, NA), fill = 2.0)), 10.0) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_replace(x, fill = -1)), digits = 1), 223.6) expect_equal(round(mean(na_replace(x, fill = 200)), digits = 1), 254.3) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_replace(x, fill = -1)), digits = 1), 251.0) expect_equal(round(mean(na_replace(x, fill = 200)), digits = 1), 274.8) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order 
to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_replace(x, fill = -1)), digits = 1), 254.5) expect_equal(round(mean(na_replace(x, fill = 200)), digits = 1), 272.6) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_replace(x, fill = -1))) expect_false(anyNA(na_replace(x, fill = 200))) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_false(anyNA(na_replace(x, fill = -5))) expect_false(anyNA(na_replace(x, fill = 1000))) expect_false(anyNA(na_replace(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA expect_false(anyNA(na_replace(x, fill = -5))) expect_false(anyNA(na_replace(x, fill = 1000))) expect_false(anyNA(na_replace(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_replace(x, fill = -5))) expect_false(anyNA(na_replace(x, fill = 1000))) expect_false(anyNA(na_replace(x))) }) test_that("Over 90% NAs", { x <- tsAirgap x[10:140] <- NA expect_false(anyNA(na_replace(x, fill = -5))) expect_false(anyNA(na_replace(x, fill = 1000))) expect_false(anyNA(na_replace(x))) }) test_that("Handling for no NAs", { x <- tsAirgapComplete expect_false(anyNA(na_replace(x))) }) ================================================ FILE: tests/testthat/test-na_seadec.R ================================================ context("na_seadec") test_that("All NA vector throws error", { expect_error(na_seadec(c(NA, NA, NA, NA, NA))) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_seadec(x, algorithm = "interpolation")), digits = 1), 276.7) 
expect_equal(round(mean(na_seadec(x, algorithm = "locf")), digits = 1), 276.0) expect_equal(round(mean(na_seadec(x, algorithm = "mean")), digits = 1), 264.3) expect_true(round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima")), digits = 1) > 277 & round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima")), digits = 1) < 283) expect_equal(round(mean(na_seadec(x, algorithm = "ma")), digits = 1), 277.1) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_seadec(x, algorithm = "interpolation")), digits = 1), 279.9) expect_equal(round(mean(na_seadec(x, algorithm = "locf")), digits = 1), 279.2) expect_equal(round(mean(na_seadec(x, algorithm = "mean")), digits = 1), 284.1) expect_true(round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima")), digits = 1) > 282 & round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima")), digits = 1) < 288) expect_equal(round(mean(na_seadec(x, algorithm = "ma")), digits = 1), 280.0) }) test_that("Given frequency is overwritten by findFrequency=T", { x <- ts(data = c(1,1,99,99,1,1,99,99,1,NA,99,99,1,1,99,99,1,1,99,99,1,1,99,99,1,1), frequency = 12) imp <- na_seadec(x, find_frequency = TRUE) # new detected frequency = 4 expect_equal(frequency(imp),4) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_seadec(x, algorithm = "interpolation")), digits = 1), 280.4) expect_equal(round(mean(na_seadec(x, algorithm = "locf")), digits = 1), 
279.7) expect_equal(round(mean(na_seadec(x, algorithm = "mean")), digits = 1), 279.5) expect_true(round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima")), digits = 1) > 277 & round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima")), digits = 1) < 283) expect_equal(round(mean(na_seadec(x, algorithm = "ma")), digits = 1), 280.6) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) expect_false(anyNA(na_seadec(x, algorithm = "interpolation"))) expect_false(anyNA(na_seadec(x, algorithm = "locf"))) expect_false(anyNA(na_seadec(x, algorithm = "mean"))) expect_false(anyNA(na_seadec(x, algorithm = "random"))) expect_false(anyNA(na_seadec(x, algorithm = "kalman"))) expect_false(anyNA(na_seadec(x, algorithm = "ma"))) }) test_that("Warning for wrong input for algorithm parameter", { expect_error(na_seadec(tsAirgap, algorithm = "wrong")) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_false(anyNA(na_seadec(x, algorithm = "interpolation"))) expect_false(anyNA(na_seadec(x, algorithm = "kalman"))) expect_false(anyNA(na_seadec(x, algorithm = "locf"))) expect_false(anyNA(na_seadec(x, algorithm = "ma"))) expect_false(anyNA(na_seadec(x, algorithm = "mean"))) expect_false(anyNA(na_seadec(x, algorithm = "random"))) expect_false(anyNA(na_seadec(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA expect_false(anyNA(na_seadec(x, algorithm = "interpolation"))) expect_false(anyNA(na_seadec(x, algorithm = "kalman"))) expect_false(anyNA(na_seadec(x, algorithm = "locf"))) expect_false(anyNA(na_seadec(x, algorithm = "ma"))) expect_false(anyNA(na_seadec(x, algorithm = "mean"))) expect_false(anyNA(na_seadec(x, algorithm = "random"))) expect_false(anyNA(na_seadec(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_seadec(x, algorithm = "interpolation"))) expect_false(anyNA(na_seadec(x, algorithm = 
"kalman"))) expect_false(anyNA(na_seadec(x, algorithm = "locf"))) expect_false(anyNA(na_seadec(x, algorithm = "ma"))) expect_false(anyNA(na_seadec(x, algorithm = "mean"))) expect_false(anyNA(na_seadec(x, algorithm = "random"))) expect_false(anyNA(na_seadec(x))) }) test_that("Over 90% NAs", { x <- tsAirgap x[10:140] <- NA expect_false(anyNA(na_seadec(x, algorithm = "interpolation"))) expect_false(anyNA(na_seadec(x, algorithm = "kalman"))) expect_false(anyNA(na_seadec(x, algorithm = "locf"))) expect_false(anyNA(na_seadec(x, algorithm = "ma"))) expect_false(anyNA(na_seadec(x, algorithm = "mean"))) expect_false(anyNA(na_seadec(x, algorithm = "random"))) expect_false(anyNA(na_seadec(x))) }) test_that("No Seasonality in series", { x <- ts(c(3, 5, 6, 7, 8, 4, 5, 6, NA, NA, 5, 7, 4, 2, NA, NA, 5, 7, 8)) expect_false(anyNA(na_seadec(x, algorithm = "interpolation"))) expect_false(anyNA(na_seadec(x, algorithm = "kalman"))) expect_false(anyNA(na_seadec(x, algorithm = "locf"))) expect_false(anyNA(na_seadec(x, algorithm = "ma"))) expect_false(anyNA(na_seadec(x, algorithm = "mean"))) expect_false(anyNA(na_seadec(x, algorithm = "random"))) expect_false(anyNA(na_seadec(x))) }) test_that("Handling for no NAs", { x <- tsAirgapComplete expect_false(anyNA(na_seadec(x))) }) test_that("Correct results for all options with the tsAirgap dataset with find frequency", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- as.vector(tsAirgap) expect_equal(round(mean(na_seadec(x, algorithm = "interpolation", find_frequency = TRUE)), digits = 1), 280.4) expect_equal(round(mean(na_seadec(x, algorithm = "interpolation", find_frequency = FALSE)), digits = 1), 280.7) # Test if set frequencys are accepted as expected for find_frequency= F expect_equal(round(mean(na_seadec(ts(x, frequency = 2), algorithm = "interpolation", find_frequency = FALSE)), 
digits = 1), 280.8) # Test if find_frequency works expect_equal(round(mean(na_seadec(ts(x, frequency = 2), algorithm = "interpolation", find_frequency = TRUE)), digits = 1), 280.4) expect_equal(round(mean(na_seadec(x, algorithm = "locf", find_frequency = TRUE)), digits = 1), 279.7) expect_equal(round(mean(na_seadec(x, algorithm = "locf", find_frequency = FALSE)), digits = 1), 278.8) expect_equal(round(mean(na_seadec(x, algorithm = "mean", find_frequency = TRUE)), digits = 1), 279.5) expect_equal(round(mean(na_seadec(x, algorithm = "mean", find_frequency = FALSE)), digits = 1), 279.8) expect_true(round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima", find_frequency = TRUE)), digits = 1) > 277 & round(mean(na_seadec(x, algorithm = "kalman", model = "auto.arima", find_frequency = TRUE)), digits = 1) < 283) expect_equal(round(mean(na_seadec(x, algorithm = "ma", find_frequency = TRUE)), digits = 1), 280.6) expect_equal(round(mean(na_seadec(x, algorithm = "ma", find_frequency = FALSE)), digits = 1), 281.2) }) ================================================ FILE: tests/testthat/test-na_seasplit.R ================================================ context("na_seasplit") test_that("All NA vector throws error", { expect_error(na_seasplit(c(NA, NA, NA, NA, NA))) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at end)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[135:144] <- NA expect_equal(round(mean(na_seasplit(x, algorithm = "interpolation")), digits = 1), 276.6) expect_equal(round(mean(na_seasplit(x, algorithm = "locf")), digits = 1), 274.8) expect_equal(round(mean(na_seasplit(x, algorithm = "mean")), digits = 1), 264.0) expect_true(round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima")), digits = 1) > 273 & 
round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima")), digits = 1) < 277) expect_equal(round(mean(na_seasplit(x, algorithm = "ma")), digits = 1), 275.0) }) test_that("Correct results for all options with a modifed tsAirgap dataset (additionalNAs at start)", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap x[1:5] <- NA expect_equal(round(mean(na_seasplit(x, algorithm = "interpolation")), digits = 1), 280.5) expect_equal(round(mean(na_seasplit(x, algorithm = "locf")), digits = 1), 278.4) expect_equal(round(mean(na_seasplit(x, algorithm = "mean")), digits = 1), 283.4) expect_true(round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima")), digits = 1) > 276 & round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima")), digits = 1) < 280) expect_equal(round(mean(na_seasplit(x, algorithm = "ma")), digits = 1), 281.1) }) test_that("Correct results for all options with the tsAirgap dataset", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- tsAirgap expect_equal(round(mean(na_seasplit(x, algorithm = "interpolation")), digits = 1), 280.3) expect_equal(round(mean(na_seasplit(x, algorithm = "locf")), digits = 1), 278.2) expect_equal(round(mean(na_seasplit(x, algorithm = "mean")), digits = 1), 279.3) expect_true(round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima")), digits = 1) > 277 & round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima")), digits = 1) < 281) expect_equal(round(mean(na_seasplit(x, algorithm = "ma")), digits = 1), 280.3) }) test_that("Imputation works for data.frame", { # Checking if NAs remain in data.frame x <- data.frame(tsAirgap, tsAirgap, tsAirgapComplete) 
expect_false(anyNA(na_seasplit(x, algorithm = "interpolation"))) expect_false(anyNA(na_seasplit(x, algorithm = "locf"))) expect_false(anyNA(na_seasplit(x, algorithm = "mean"))) expect_false(anyNA(na_seasplit(x, algorithm = "random"))) expect_false(anyNA(na_seasplit(x, algorithm = "kalman"))) expect_false(anyNA(na_seasplit(x, algorithm = "ma"))) }) test_that("Warning for wrong input for algorithm parameter", { expect_error(na_seasplit(tsAirgap, algorithm = "wrong")) }) test_that("Test NA at beginning", { x <- tsAirgap x[1:2] <- NA expect_false(anyNA(na_seasplit(x, algorithm = "interpolation"))) expect_false(anyNA(na_seasplit(x, algorithm = "kalman"))) expect_false(anyNA(na_seasplit(x, algorithm = "locf"))) expect_false(anyNA(na_seasplit(x, algorithm = "ma"))) expect_false(anyNA(na_seasplit(x, algorithm = "mean"))) expect_false(anyNA(na_seasplit(x, algorithm = "random"))) expect_false(anyNA(na_seasplit(x))) }) test_that("Test NA at end", { x <- tsAirgap x[143:144] <- NA expect_false(anyNA(na_seasplit(x, algorithm = "interpolation"))) expect_false(anyNA(na_seasplit(x, algorithm = "kalman"))) expect_false(anyNA(na_seasplit(x, algorithm = "locf"))) expect_false(anyNA(na_seasplit(x, algorithm = "ma"))) expect_false(anyNA(na_seasplit(x, algorithm = "mean"))) expect_false(anyNA(na_seasplit(x, algorithm = "random"))) expect_false(anyNA(na_seasplit(x))) }) test_that("Multiple NAs in a row", { x <- tsAirgap x[40:80] <- NA expect_false(anyNA(na_seasplit(x, algorithm = "interpolation"))) expect_false(anyNA(na_seasplit(x, algorithm = "kalman"))) expect_false(anyNA(na_seasplit(x, algorithm = "locf"))) expect_false(anyNA(na_seasplit(x, algorithm = "ma"))) expect_false(anyNA(na_seasplit(x, algorithm = "mean"))) expect_false(anyNA(na_seasplit(x, algorithm = "random"))) expect_false(anyNA(na_seasplit(x))) }) test_that("Over 50% NAs", { x <- tsAirgap x[30:100] <- NA expect_false(anyNA(na_seasplit(x, algorithm = "interpolation"))) expect_false(anyNA(na_seasplit(x, algorithm = 
"kalman"))) expect_false(anyNA(na_seasplit(x, algorithm = "locf"))) expect_false(anyNA(na_seasplit(x, algorithm = "ma"))) expect_false(anyNA(na_seasplit(x, algorithm = "mean"))) expect_false(anyNA(na_seasplit(x, algorithm = "random"))) expect_false(anyNA(na_seasplit(x))) }) test_that("No Seasonality in series", { x <- ts(c(3, 5, 6, 7, 8, 4, 5, 6, NA, NA, 5, 7, 4, 2, NA, NA, 5, 7, 8)) expect_false(anyNA(na_seasplit(x, algorithm = "interpolation"))) expect_false(anyNA(na_seasplit(x, algorithm = "kalman"))) expect_false(anyNA(na_seasplit(x, algorithm = "locf"))) expect_false(anyNA(na_seasplit(x, algorithm = "ma"))) expect_false(anyNA(na_seasplit(x, algorithm = "mean"))) expect_false(anyNA(na_seasplit(x, algorithm = "random"))) expect_false(anyNA(na_seasplit(x))) }) test_that("Handling for no NAs", { x <- tsAirgapComplete expect_false(anyNA(na_seasplit(x))) }) test_that("Correct results for all options with the tsAirgap dataset for find_frequency", { skip_on_cran() # Using mean over resulting vector to check correctness # In order to avoid writing down the complete resulting vector # Using rounded version in order to avoid writing down all decimals x <- as.vector(tsAirgap) expect_equal(round(mean(na_seasplit(x, algorithm = "interpolation", find_frequency = TRUE)), digits = 1), 280.3) expect_equal(round(mean(na_seasplit(x, algorithm = "interpolation", find_frequency = FALSE)), digits = 1), 280.7) # Check that other frequencys lead to different result if find_frequency is FALSE expect_equal(round(mean(na_seasplit(ts(x, frequency = 2), algorithm = "interpolation", find_frequency = FALSE)), digits = 1), 281.7) # Check that find_frequency overrides frequency for find_frequency = T expect_equal(round(mean(na_seasplit(ts(x, frequency = 2), algorithm = "interpolation", find_frequency = TRUE)), digits = 1), 280.3) expect_equal(round(mean(na_seasplit(x, algorithm = "locf", find_frequency = TRUE)), digits = 1), 278.2) expect_equal(round(mean(na_seasplit(x, algorithm = "locf", 
find_frequency = FALSE)), digits = 1), 278.8) expect_equal(round(mean(na_seasplit(x, algorithm = "mean", find_frequency = TRUE)), digits = 1), 279.3) expect_equal(round(mean(na_seasplit(x, algorithm = "mean", find_frequency = FALSE)), digits = 1), 279.8) expect_true(round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima", find_frequency = TRUE)), digits = 1) > 277 & round(mean(na_seasplit(x, algorithm = "kalman", model = "auto.arima", find_frequency = TRUE)), digits = 1) < 281) expect_equal(round(mean(na_seasplit(x, algorithm = "ma", find_frequency = TRUE)), digits = 1), 280.3) expect_equal(round(mean(na_seasplit(x, algorithm = "ma", find_frequency = FALSE)), digits = 1), 281.2) }) ================================================ FILE: tests/testthat/test-parameter-maxgap.R ================================================ context("maxgap") test_that("Test that function works and prints output", { x <- tsAirgap x[4] <- NA x[144] <- NA x[143] <- NA expect_equal(sum(is.na(na_mean(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_mean(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_mean(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_mean(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_mean(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_locf(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_locf(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_locf(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_locf(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_locf(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_random(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_random(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_random(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_random(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_random(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_ma(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_ma(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_ma(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_ma(x, maxgap = 2))), 3) 
expect_equal(sum(is.na(na_ma(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_replace(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_replace(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_replace(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_replace(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_replace(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_seadec(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_seadec(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_seadec(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_seadec(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_seadec(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_seasplit(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_seasplit(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_seasplit(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_seasplit(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_seasplit(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_kalman(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_kalman(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_kalman(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_kalman(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_kalman(x, maxgap = 3))), 0) expect_equal(sum(is.na(na_interpolation(x, maxgap = 1))), 7) expect_equal(sum(is.na(na_interpolation(x, maxgap = 0))), 16) expect_equal(sum(is.na(na_interpolation(x, maxgap = -1))), 0) expect_equal(sum(is.na(na_interpolation(x, maxgap = 2))), 3) expect_equal(sum(is.na(na_interpolation(x, maxgap = 3))), 0) }) ================================================ FILE: tests/testthat/test-statsNA.R ================================================ context("statsNA") test_that("Test that function works and prints output", { expect_output(statsNA(tsAirgap, print_only = T)) }) test_that("Test results of function", { expect_equal(statsNA(tsAirgap, print_only = F)$number_NAs, 13) expect_equal(statsNA(tsAirgap, print_only = F)$longest_na_gap, 3) expect_equal(statsNA(tsAirgap, print_only = F)$most_weighty_na_gap, 1) expect_equal(statsNA(tsAirgap, print_only = 
F)$most_frequent_na_gap, 1) expect_equal(statsNA(tsAirgap, print_only = F)$length_series, 144) expect_equal(statsNA(tsAirgap, print_only = F)$number_na_gaps, 11) expect_true(statsNA(tsNH4, print_only = F)$average_size_na_gaps > 5) }) ================================================ FILE: tests/testthat.R ================================================ library(testthat) test_check("imputeTS") ================================================ FILE: vignettes/Cheat_Sheet_imputeTS.pdf.asis ================================================ %\VignetteIndexEntry{Cheat Sheet imputeTS} %\VignetteEngine{R.rsp::asis} %\VignetteKeyword{PDF} %\VignetteKeyword{HTML} %\VignetteKeyword{vignette} %\VignetteKeyword{package} ================================================ FILE: vignettes/RJournal.sty ================================================ % Package `RJournal' to use with LaTeX2e % Copyright (C) 2010 by the R Foundation % Copyright (C) 2013 by the R Journal % % Originally written by Kurt Hornik and Friedrich Leisch with subsequent % edits by the editorial board \NeedsTeXFormat{LaTeX2e}[1995/12/01] \ProvidesPackage{RJournal}[2013/08/27 v0.13 RJournal package] \RequirePackage{tikz} % Overall page layout, fonts etc ----------------------------------------------- % Issues of \emph{The R Journal} are created from the standard \LaTeX{} % document class \pkg{report}. \RequirePackage{geometry} \geometry{a4paper, textwidth=14cm, top=1cm, bottom=1cm, includehead,includefoot,centering, footskip=1.5cm} \raggedbottom \RequirePackage{fancyhdr} \fancyhead{} \fancyheadoffset{2cm} \fancyhead[L]{\textsc{\RJ@sectionhead}} \fancyhead[R]{\thepage} \fancyfoot{} \fancyfoot[L]{The R Journal Vol. 
\RJ@volume/\RJ@number, \RJ@month~\RJ@year} \fancyfoot[R]{ISSN 2073-4859} \pagestyle{fancy} % We use the following fonts (all with T1 encoding): % % rm & palatino % tt & inconsolata % sf & helvetica % math & palatino \RequirePackage{microtype} \RequirePackage[scaled=0.92]{helvet} \RequirePackage{palatino,mathpazo} \RequirePackage[scaled=1.02]{inconsolata} \RequirePackage[T1]{fontenc} \RequirePackage[hyphens]{url} \RequirePackage[pagebackref]{hyperref} \renewcommand{\backref}[1]{[p#1]} % Dark blue colour for all links \RequirePackage{color} \definecolor{link}{rgb}{0.45,0.51,0.67} \hypersetup{ colorlinks,% citecolor=link,% filecolor=link,% linkcolor=link,% urlcolor=link } % Give the text a little room to breathe \setlength{\parskip}{3pt} \RequirePackage{setspace} \setstretch{1.05} % Issue and article metadata --------------------------------------------------- % Basic front matter information about the issue: volume, number, and % date. \newcommand{\volume}[1]{\def\RJ@volume{#1}} \newcommand{\volnumber}[1]{\def\RJ@number{#1}} \renewcommand{\month}[1]{\def\RJ@month{#1}} \renewcommand{\year}[1]{\def\RJ@year{#1}} % Individual articles correspond to % chapters, and are contained in |article| environments. This makes it % easy to have figures counted within articles and hence hyperlinked % correctly. % An article has an author, a title, and optionally a subtitle. We use % the obvious commands for specifying these. Articles will be put in certain % journal sections, named by \sectionhead. \newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} \renewcommand{\author} [1]{\def\RJ@author{#1}} \renewcommand{\title} [1]{\def\RJ@title{#1}} \newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} % Control appearance of titles: make slightly smaller than usual, and % suppress section numbering. 
See http://tex.stackexchange.com/questions/69749 % for why we don't use \setcounter{secnumdepth}{-1} \usepackage[medium]{titlesec} \usepackage{titletoc} \titleformat{\section} {\normalfont\large\bfseries}{}{0em}{} \titleformat{\subsection}{\normalfont\normalsize\bfseries}{}{0em}{} \titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} % Article layout --------------------------------------------------------------- % Environment |article| clears the article header information at its beginning. % We use |\FloatBarrier| from the placeins package to keep floats within % the article. \RequirePackage{placeins} \newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} % Refereed articles should have an abstract, so we redefine |\abstract| to % give the desired style \renewcommand{\abstract}[1]{% \setstretch{1}% \noindent% \small% \textbf{Abstract} #1 } % The real work is done by a redefined version of |\maketitle|. Note % that even though we do not want chapters (articles) numbered, we % need to increment the chapter counter, so that figures get correct % labelling. \renewcommand{\maketitle}{% \noindent \chapter{\RJ@title}\refstepcounter{chapter} \ifx\empty\RJ@subtitle \else \noindent\textbf{\RJ@subtitle} \par\nobreak\addvspace{\baselineskip} \fi \ifx\empty\RJ@author \else \noindent\textit{\RJ@author} \par\nobreak\addvspace{\baselineskip} \fi \@afterindentfalse\@nobreaktrue\@afterheading } % Now for some ugly redefinitions. We do not want articles to start a % new page. 
(Actually, we do, but this is handled via explicit % \newpage % % The name@of@eq is a hack to get hyperlinks to equations to work % within each article, even though there may be multiple eq.(1) % \begin{macrocode} \renewcommand\chapter{\secdef\RJ@chapter\@schapter} \providecommand{\nohyphens}{% \hyphenpenalty=10000\exhyphenpenalty=10000\relax} \newcommand{\RJ@chapter}{% \edef\name@of@eq{equation.\@arabic{\c@chapter}}% \renewcommand{\@seccntformat}[1]{}% \@startsection{chapter}{0}{0mm}{% -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% \phantomsection\normalfont\huge\bfseries\raggedright}} % Book reviews should appear as sections in the text and in the pdf bookmarks, % however we wish them to appear as chapters in the TOC. Thus we define an % alternative to |\maketitle| for reviews. \newcommand{\review}[1]{ \pdfbookmark[1]{#1}{#1} \section*{#1} \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} } % We want bibliographies as starred sections within articles. % \RequirePackage[sectionbib,round]{natbib} \bibliographystyle{abbrvnat} % Equations, figures and tables are counted within articles, but we do % not show the article number. For equations it becomes a bit messy to avoid % having hyperref getting it wrong. % \numberwithin{equation}{chapter} \renewcommand{\theequation}{\@arabic\c@equation} \renewcommand{\thefigure}{\@arabic\c@figure} \renewcommand{\thetable}{\@arabic\c@table} % Issue layout ----------------------------------------------------------------- % Need to provide our own version of |\tableofcontents|. We use the % tikz package to get the rounded rectangle. Notice that |\section*| % is really the same as |\chapter*|. 
\renewcommand{\contentsname}{Contents} \renewcommand\tableofcontents{% \vspace{1cm} \section*{\contentsname} { \@starttoc{toc} } } \renewcommand{\titlepage}{% \thispagestyle{empty} \hypersetup{ pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% pdfauthor={R Foundation for Statistical Computing},% } \noindent \begin{center} \fontsize{50pt}{50pt}\selectfont The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} Journal \end{center} {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} \rule{\textwidth}{1pt} \begin{center} {\Large A peer-reviewed, open-access publication of the \\ R Foundation for Statistical Computing} \end{center} % And finally, put in the TOC box. Note the way |tocdepth| is adjusted % before and after producing the TOC: thus, we can ensure that only % articles show up in the printed TOC, but that in the PDF version, % bookmarks are created for sections and subsections as well (provided % that the non-starred forms are used). \setcounter{tocdepth}{0} \tableofcontents \setcounter{tocdepth}{2} \clearpage } % Text formatting -------------------------------------------------------------- \newcommand{\R}{R} \newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} \newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} % Simple font selection is not good enough. For example, |\texttt{--}| % gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we % need to turn off ligatures, which currently only happens for commands % |\code| and |\samp| and the ones derived from them. Hyphenation is % another issue; it should really be turned off inside |\samp|. And % most importantly, \LaTeX{} special characters are a nightmare. E.g., % one needs |\~{}| to produce a tilde in a file name marked by |\file|. % Perhaps a few years ago, most users would have agreed that this may be % unfortunate but should not be changed to ensure consistency. 
But with % the advent of the WWW and the need for getting `|~|' and `|#|' into % URLs, commands which only treat the escape and grouping characters % specially have gained acceptance \DeclareRobustCommand\code{\bgroup\@noligs\@codex} \def\@codex#1{\texorpdfstring% {{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% {#1}\egroup} \newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} \newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} \DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} \def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} \newcommand{\var}[1]{{\normalfont\textsl{#1}}} \let\env=\code \newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} \let\command=\code \let\option=\samp \newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} % \acronym is effectively disabled since not used consistently \newcommand{\acronym}[1]{#1} \newcommand{\strong}[1]{\texorpdfstring% {{\normalfont\fontseries{b}\selectfont #1}}% {#1}} \let\pkg=\strong \newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% \let\cpkg=\CRANpkg \newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} \newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} % Example environments --------------------------------------------------------- \RequirePackage{fancyvrb} \RequirePackage{alltt} \DefineVerbatimEnvironment{example}{Verbatim}{} \renewenvironment{example*}{\begin{alltt}}{\end{alltt}} % Support for output from Sweave, and generic session style code % These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex % won't use a condensed font in that case. 
% Update (2015-05-28 by DS): remove fontsize=\small to match example environment \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \DefineVerbatimEnvironment{Sin}{Verbatim}{} \DefineVerbatimEnvironment{Sout}{Verbatim}{} \newenvironment{Schunk}{}{} % Mathematics ------------------------------------------------------------------ % The implementation of |\operatorname| is similar to the mechanism % \LaTeXe{} uses for functions like sin and cos, and simpler than the % one of \AmSLaTeX{}. We use |\providecommand| for the definition in % order to keep the one of the \pkg{amstex} if this package has % already been loaded. % \begin{macrocode} \providecommand{\operatorname}[1]{% \mathop{\operator@font#1}\nolimits} \RequirePackage{amsfonts} \renewcommand{\P}{% \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} \newcommand{\E}{% \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} \newcommand{\VAR}{\operatorname{var}} \newcommand{\COV}{\operatorname{cov}} \newcommand{\COR}{\operatorname{cor}} % Figures ---------------------------------------------------------------------- \RequirePackage[font=small,labelfont=bf]{caption} % Wide environments for figures and tables ------------------------------------- \RequirePackage{environ} % An easy way to make a figure span the full width of the page \NewEnviron{widefigure}[1][]{ \begin{figure}[#1] \advance\leftskip-2cm \begin{minipage}{\dimexpr\textwidth+4cm\relax}% \captionsetup{margin=2cm} \BODY \end{minipage}% \end{figure} } \NewEnviron{widetable}[1][]{ \begin{table}[#1] \advance\leftskip-2cm \begin{minipage}{\dimexpr\textwidth+4cm\relax}% \captionsetup{margin=2cm} \BODY \end{minipage}% \end{table} } ================================================ FILE: vignettes/gallery_visualizations.Rmd ================================================ --- title: "Gallery: Times Series Missing Data Visualizations" author: "Steffen Moritz" date: "`r 
Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Gallery: Times Series Missing Data Visualizations} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r knitr-setup, include = FALSE} knitr::opts_chunk$set(fig.align = "center", fig.width = 6, fig.height = 4, dpi = 100) ``` There are multiple different plots for (univariate) time series missing data available in the imputeTS package. These can be grouped in the following three categories: - Getting an Overview (`ggplot_na_distribution`) - Deeper insights and missing data patterns (`ggplot_na_distribution2`, `ggplot_na_gapsize`) - Assessing/Exploring imputation results (`ggplot_na_imputations`) *This vignette showcases all of the available visualizations in the imputeTS package. More information on time series imputation and the imputeTS package in general can be found in this paper: [imputeTS: Time Series Missing Value Imputation in R](https://journal.r-project.org/archive/2017/RJ-2017-009/index.html).* ## Getting a first overview (`ggplot_na_distribution`) The best starting point for getting an overview about the missing data in your (univariate) time series is the `ggplot_na_distribution()` plot. It gives a nice first overview where in the time series the missing values occur and how they are distributed. It also already gives a rough impression on how many missing data are in different intervals of the time series. Usage is easy: just supply the (univariate) time series to the function call. Only the time series is needed as input - all additional parameters are only needed to alter the appearance of the plot. ```{r ggplot-na-distribution-example1, message=FALSE} library("imputeTS") ggplot_na_distribution(tsAirgap) ``` It is important to note that the input itself needs to be univariate. For data types with multiple variables/columns only use the column you want to plot as input parameter `x`.
The x-axis time information can be added with the `x_axis_labels` parameter - otherwise the consecutive index of observations in the series is used as x-axis tick label. Thus for a data.frame `df` with multiple columns `df$date`, `df$value`, `df$another_value`, `df$yet_another_value` where we want to plot `df$value` with Dates on the x-axis the required function call would look like this: ```{r create-df, results=F, echo = F, warning=FALSE, fig.show='hide'} df <- structure(list(date = structure(c(-21185, -20819, -20454, -20089, -19724, -19358, -18993, -18628, -18263, -17897, -17532, -17167, -16802, -16436, -16071, -15706, -15341, -14975, -14610, -14245, -13880, -13514, -13149, -12784, -12419, -12053, -11688, -11323, -10958, -10592, -10227, -9862, -9497, -9131, -8766, -8401, -8036, -7670, -7305, -6940, -6575, -6209, -5844, -5479, -5114, -4748, -4383, -4018, -3653, -3287, -2922, -2557, -2192, -1826, -1461, -1096, -731, -365, 0, 365), class = "Date"), value = structure(c(48.2, 50.5, 49.4, 51.1, 49.4, 47.9, 49.8, 50.9, 49.3, 51.9, 50.8, 49.6, 49.3, 50.6, 48.4, 50.7, 50.9, 50.6, 51.5, 52.8, 51.8, 51.1, 49.8, 50.2, 50.4, NA, NA, NA, 48.8, 51.7, 51, 50.6, 51.7, 51.5, 52.1, 51.3, 51, 54, 51.4, 52.7, 53.1, 54.6, NA, 52, 50.9, 52.6, 50.2, 52.6, 51.6, 51.9, 50.5, 50.9, 51.7, 51.4, 51.7, 50.8, 51.9, 51.8, 50.0, 49.1 ), .Tsp = c(1912, 1971, 1), class = "ts")), class = "data.frame", row.names = c(NA, -60L)) ``` ```{r ggplot-na-distribution-example2} ggplot_na_distribution(x = df$value, x_axis_labels = df$date) ``` ## Detailed information about certain intervals (`ggplot_na_distribution2`) When a summary for certain time intervals (e. g. weeks) is needed, the `ggplot_na_distribution2()` plot is useful. It shows the missing data percentage for each interval as a bar. This kind of summary plot is also quite useful for very long time series, which would not fit into the plot window as a lineplot. 
Like for `ggplot_na_distribution()` only parameter `x` (the univariate time series) is mandatory for creating a plot with `ggplot_na_distribution2()`. With the parameter `interval_size` the size of the interval can be changed (default is an automatically calculated interval size that gives a good overall overview). All other parameters are mostly needed for changing the appearance of the plot. ```{r ggplot-na-distribution2-example1} ggplot_na_distribution2(tsNH4) ``` Alternatively the missing data count for the interval (instead of the percentages) can be shown. Below is an example with a custom interval size of 144 and a custom color for the missing data bars. Since the example data is recorded in 10 minute time steps, an interval_size of 144 means that we are using daily intervals (6 measurements per hour, 24 hours per day, 6*24 = 144). ```{r ggplot-na-distribution2-example2} ggplot_na_distribution2(tsNH4, measure = "count", interval_size = 144, color_missing = "gold3") ``` ## Insights about missing data patterns (`ggplot_na_gapsize`) Often deeper insights about the missing data are quite useful. These insights can give hints of possible causes of the missing data and an indication, which imputation algorithms might give good results. The plot gives an overview about how often different gapsizes (NAs in a row) occur in the time series. Only the parameter `x` (the univariate time series) is needed as mandatory input. By default the plot shows only the 10 most often occurring gapsizes. Use parameter `limit` to increase this number. ```{r ggplot-na-gapsize-example1} library(imputeTS) ggplot_na_gapsize(tsNH4) ``` The plot shows both the number of occurrences and the resulting NAs for the respective gapsizes. Resulting NAs can be explained as the number of NAs a certain gapsize accounts for in total. For example a gapsize of 3 that occurs 5 times results in 15 NAs overall. The parameter `include_total` can be used to change this behavior.
Below is an example of the same plot with specific settings for `limit` and `include_total`. ```{r ggplot-na-gapsize-example2} library(imputeTS) ggplot_na_gapsize(tsNH4, include_total = F, limit = 15) ``` ## Evaluate imputation results (`ggplot_na_imputations`) After using imputation functions like `na_kalman()`, `na_interpolation()`, `na_seadec()` there is often the need to get a first impression on how well the algorithm performs. The `ggplot_na_imputations()` plot gives a good impression on how well the imputed values fit into the original time series. Mandatory inputs for this function are these two parameters: `x_with_na` (the time series as it was before imputation) and `x_with_imputations` (the time series without NAs after imputation). ```{r ggplot-na-imputations-example1} library(imputeTS) imp <- na_interpolation(tsAirgap) ggplot_na_imputations(tsAirgap, imp) ``` In some cases (mostly when performing imputation experiments and benchmarks) the NAs were only artificially introduced into the original time series. This means there exists a ground truth for the NA values (the complete time series before introducing the NAs). In this case you can additionally use the `x_with_truth` parameter to get a plot that displays both the imputations and the ground truth. ```{r ggplot-na-imputations-example2} library(imputeTS) imp <- na_mean(tsAirgap) ggplot_na_imputations(x_with_na = tsAirgap, x_with_imputations = imp, x_with_truth = tsAirgapComplete ) ``` ## Support If you found a bug or have suggestions, feel free to open an issue on GitHub or get in contact via steffen.moritz10 at gmail.com.
> All feedback is welcome ================================================ FILE: vignettes/imputeTS-Time-Series-Missing-Value-Imputation-in-R.ltx ================================================ %\VignetteIndexEntry{imputeTS: Time Series Missing Value Imputation in R} %\VignetteEngine{R.rsp::tex} \documentclass[a4paper]{report} \usepackage{etex} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{RJournal} \usepackage{amsmath,amssymb,array} \usepackage{booktabs} \usepackage{textcomp} \begin{document} %% do not edit, for illustration only \sectionhead{Contributed research article} \volume{XX} \volnumber{YY} \year{20ZZ} \month{AAAA} %% replace RJtemplate with your article \begin{article} %-------------------------------------- % !TeX root = RJwrapper.tex \title{\pkg{imputeTS}: Time Series Missing Value Imputation in R} \author{by Steffen Moritz and Thomas Bartz-Beielstein} \maketitle %================================================================================ \abstract{ The \pkg{imputeTS} package specializes on univariate time series imputation. It offers multiple state-of-the-art imputation algorithm implementations along with plotting functions for time series missing data statistics. While imputation in general is a well-known problem and widely covered by R packages, finding packages able to fill missing values in univariate time series is more complicated. The reason for this lies in the fact, that most imputation algorithms rely on inter-attribute correlations, while univariate time series imputation instead needs to employ time dependencies. This paper provides an introduction to the \pkg{imputeTS} package and its provided algorithms and tools. Furthermore, it gives a short overview about univariate time series imputation in R. 
} %================================================================================ \section{Introduction} \label{sec:Introduction} In almost every domain from industry \citep{billinton1996time} to biology \citep{bar2003continuous}, finance \citep{taylor2007modelling} up to social science \citep{gottman1981time} different time series data are measured. While the recorded datasets itself may be different, one common problem are missing values. Many analysis methods require missing values to be replaced with reasonable values up-front. In statistics this process of replacing missing values is called \textit{imputation}. \noindent\\ Time series imputation thereby is a special sub-field in the imputation research area. Most popular techniques like Multiple Imputation \citep{rubin1987multiple}, Expectation-Maximization \citep{dempster1977maximum}, Nearest Neighbor \citep{vacek1980examination} and Hot Deck \citep{ford1983overview} rely on inter-attribute correlations to estimate values for the missing data. Since univariate time series do not possess more than one attribute, these algorithms cannot be applied directly. Effective univariate time series imputation algorithms instead need to employ the inter-time correlations. \noindent\\ On CRAN there are several packages solving the problem of imputation of multivariate data. Most popular and mature (among others) are \CRANpkg{AMELIA} \citep{AMELIA}, \CRANpkg{mice} \citep{mice}, \CRANpkg{VIM} \citep{VIM} and \CRANpkg{missMDA} \citep{missMDA}. However, since these packages are designed for multivariate data imputation only they do not work for univariate time series. \noindent\\ At the moment \CRANpkg{imputeTS} \citep{imputeTS} is the only package on CRAN that is solely dedicated to univariate time series imputation and includes multiple algorithms. Nevertheless, there are some other packages that include imputation functions as addition to their core package functionality. 
Most noteworthy being \CRANpkg{zoo} \citep{zoo} and \CRANpkg{forecast} \citep{forecast}. Both packages offer also some advanced time series imputation functions. The packages \CRANpkg{spacetime} \citep{spacetime}, \CRANpkg{timeSeries} \citep{timeSeries} and \CRANpkg{xts} \citep{xts} should also be mentioned, since they contain some very simple but quick time series imputation methods. For a broader overview about available time series imputation packages in R see also \citep{Moritz15a}. In this technical report we evaluate the performance of several univariate imputation functions in R on different time series. \noindent\\ This paper is structured as follows: Section~\nameref{sec:Overview imputeTS package} gives an overview, about all features and functions included in the \pkg{imputeTS} package. This is followed by~\nameref{sec:Usage examples} of the different provided functions. The paper ends with a~\nameref{sec:Conclusions} section. %=============================================================================== \section{Overview \pkg{imputeTS} package} \label{sec:Overview imputeTS package} The \pkg{imputeTS} package can be found on CRAN and is an easy to use package that offers several utilities for \dfn{'univariate, equi-spaced, numeric time series'}. \noindent Univariate means there is just one attribute that is observed over time. Which leads to a sequence of single observations $o_{1}$, $o_{2}$, $o_{3}$, ... $o_{n}$ at successive points $t_{1}$, $t_{2}$, $t_{3}$, ... $t_{n}$ in time. Equi-spaced means, that time increments between successive data points are equal $|t_{1} - t_{2}| = |t_{2} - t_{3}| = ... = |t_{n-1} - t_{n}|$. Numeric means that the observations are measurable quantities that can be described as a number. \noindent In the first part of this section, a general overview about all available functions and datasets is given. 
This is followed by more detailed overviews about the three areas covered by the package: 'Plots \& Statistics', 'Imputation' and 'Datasets'. Information about how to apply these functions and tools can be found later in the ~\nameref{sec:Usage examples} section. \noindent\\ \subsection{General overview} As can be seen in Table~\ref{tab:generaloverview}, beyond several imputation algorithm implementations the package also includes plotting functions and datasets. The imputation algorithms can be divided into rather simple but fast approaches like mean imputation and more advanced algorithms that need more computation time like Kalman smoothing on a structural model. \begin{table}[h] \centering \begin{tabular}{@{}llll@{}} \toprule \textbf{Simple Imputation} & \textbf{Imputation} & \textbf{Plots \& Statistics} & \textbf{Datasets} \\ \midrule na\_locf & na\_interpolation & ggplot\_na\_distribution & tsAirgap \\ na\_mean & na\_kalman & ggplot\_na\_distribution2 & tsAirgapComplete \\ na\_random & na\_ma & ggplot\_na\_gapsize & tsHeating \\ na\_replace & na\_seadec & ggplot\_na\_imputations & tsHeatingComplete \\ na\_remove & na\_seasplit & statsNA & tsNH4 \\ & & & tsNH4Complete \\ \bottomrule \end{tabular} \caption{General Overview imputeTS package} \label{tab:generaloverview} \end{table} \noindent As a whole, the package aims to support the user in the complete process of replacing missing values in time series. This process starts with analyzing the distribution of the missing values using the \code{statsNA} function and the plots of \code{ggplot\_na\_distribution}, \code{ggplot\_na\_distribution2}, \code{ggplot\_na\_gapsize}. In the next step the actual imputation can take place with one of the several algorithm options. Finally, the imputation results can be visualized with the \code{ggplot\_na\_imputations} function. Additionally, the package contains three datasets, each in a version with and without missing values, that can be used to test imputation algorithms.
\noindent \\ \subsection{Plots \& Statistics functions} An overview about the available plots and statistics functions can be found in Table~\ref{tab:overviewstatistics}. To get a good impression what the plots look like section ~\nameref{sec:Usage examples} is recommended. \begin{table}[h] \centering \begin{tabular}{@{}ll@{}} \toprule \textbf{Function} & \textbf{Description} \\ \midrule ggplot\_na\_distribution & Visualize Distribution of Missing Values \\ ggplot\_na\_distribution2 & Visualize Distribution of Missing Values (Barplot) \\ ggplot\_na\_gapsize & Visualize Distribution of NA gap sizes \\ ggplot\_na\_imputations & Visualize Imputed Values \\ statsNA & Print Statistics about the Missing Data \\ \bottomrule \end{tabular} \caption{Overview Plots \& Statistics} \label{tab:overviewstatistics} \end{table} \noindent The \code{statsNA} function calculates several missing data statistics of the input data. This includes overall percentage of missing values, absolute amount of missing values, amount of missing values in different sections of the data, longest series of consecutive NAs and occurrence of consecutive NAs. The \code{ggplot\_na\_distribution} function visualizes the distribution of NAs in a time series. This is done using a standard time series plot, in which areas with missing data are colored red. This enables the user to see at first sight where in the series most of the missing values are located. The \code{ggplot\_na\_distribution2} function provides the same insights to users, but is designed for very large time series. This is necessary for time series with 1000 and more observations, where it is not possible to plot each observation as a single point. The \code{ggplot\_na\_gapsize} function provides information about consecutive NAs by showing the most common NA gap sizes in the time series. The \code{ggplot\_na\_imputations} function is designated for visual inspection of the results after applying an imputation algorithm.
Therefore, newly imputed observations are shown in a different color than the rest of the series. \noindent \\ \subsection{Imputation functions} An overview about all available imputation algorithms can be found in Table~\ref{tab:overviewimputation}. Even if these functions are very easy to apply, some examples can be found later in section~\nameref{sec:Usage examples}. More detailed information about the theoretical background of the algorithms can be found in the \pkg{imputeTS} manual \citep{imputeTSmanual}. \begin{table}[h] \centering \begin{tabular}{@{}lll@{}} \toprule \textbf{Function} & \textbf{Option} & \textbf{Description} \\ \midrule na\_interpolation & & \\ & linear & Imputation by Linear Interpolation \\ & spline & Imputation by Spline Interpolation \\ & stine & Imputation by Stineman Interpolation \\ na\_kalman & & \\ & StructTS & Imputation by Structural Model \& Kalman Smoothing \\ & auto.arima & Imputation by ARIMA State Space Representation \& Kalman Sm. \\ na\_locf & & \\ & locf & Imputation by Last Observation Carried Forward \\ & nocb & Imputation by Next Observation Carried Backward \\ na\_ma & & \\ & simple & Missing Value Imputation by Simple Moving Average \\ & linear & Missing Value Imputation by Linear Weighted Moving Average \\ & exponential & Missing Value Imputation by Exponential Weighted Moving Average \\ na\_mean & & \\ & mean & Missing Value Imputation by Mean Value \\ & median & Missing Value Imputation by Median Value \\ & mode & Missing Value Imputation by Mode Value \\ na\_random & & Missing Value Imputation by Random Sample \\ na\_replace & & Replace Missing Values by a Defined Value \\ \midrule na\_seadec & & Seasonally Decomposed Missing Value Imputation \\ na\_seasplit & & Seasonally Splitted Missing Value Imputation \\ \midrule na\_remove & & Remove Missing Values \\ \bottomrule \end{tabular} \caption{Overview Imputation Algorithms} \label{tab:overviewimputation} \end{table} \FloatBarrier \noindent For convenience similar
algorithms are available under one function name as parameter option. For example linear, spline and stineman interpolation are all included in the \code{na\_interpolation} function. The \code{na\_mean}, \code{na\_locf}, \code{na\_replace}, \code{na\_random} functions are all simple and fast. In comparison, \code{na\_interpolation}, \code{na\_kalman}, \code{na\_ma}, \code{na\_seasplit}, \code{na\_seadec} are more advanced algorithms that need more computation time. The \code{na\_remove} function is a special case, since it only deletes all missing values. Thus, it is not really an imputation function. It should be handled with care since removing observations may corrupt the time information of the series. The \code{na\_seasplit} and \code{na\_seadec} functions are as well exceptions. These perform seasonal split / decomposition operations as a preprocessing step. For the imputation itself, one out of the other imputation algorithms can be used (which one can be set as option). Looking at all available imputation methods, no single overall best method can be pointed out. Imputation performance is always very dependent on the characteristics of the input time series. Even imputation with mean values can sometimes be an appropriate method. For time series with a strong seasonality usually \code{na\_kalman} and \code{na\_seadec} / \code{na\_seasplit} perform best. In general, for most time series one algorithm out of \code{na\_kalman}, \code{na\_interpolation} and \code{na\_seadec} will yield the best results. Meanwhile, \code{na\_random}, \code{na\_mean}, \code{na\_locf} will be at the lower end accuracy wise for the majority of input time series. \noindent \subsection{Datasets} \label{subsec:Datasets} As can be seen in Table~\ref{tab:overviewdatasets}, all three datasets are available in a version with missing data and in a complete version. The provided time series are designated as benchmark datasets for univariate time series imputation. 
They shall enable users to quickly compare and test imputation algorithms. Without these datasets the process of testing time series imputation algorithms would require to manually delete certain observations. The benchmark data simplifies this: imputation algorithms can directly be applied to the dataset versions with missing values, which then can be compared to the complete dataset versions afterwards. Since the time series are specified, researchers can use these to compare their algorithms against each other. \noindent\\ Reached RMSE or MAPE values on these datasets are easily understandable results to quote and compare against. Nevertheless, comparing algorithms using these fixed datasets can only be a first indicator of how well algorithms perform in general. Especially for the very short \code{tsAirgap} series (with just 13 NA values) random lucky guesses can considerably influence the results. A complete benchmark would include: 'Different missing data percentages', 'Different datasets', 'Different random seeds for missing data simulation'. \noindent\\ Overall there is a relatively small time series provided in \code{tsAirgap}, a medium one in \code{tsNH4} and a large time series in \code{tsHeating}. The \code{tsHeating} and \code{tsNH4} are both sensor data, while \code{tsAirgap} is count data. 
\begin{table}[h] \centering \begin{tabular}{@{}ll@{}} \toprule \textbf{Dataset} & \textbf{Description} \\ \midrule tsAirgap & Time series of monthly airline passengers (with NAs) \\ tsAirgapComplete & Time series of monthly airline passengers (complete) \\ tsHeating & Time series of a heating systems' supply temperature (with NAs) \\ tsHeatingComplete & Time series of a heating systems' supply temperature (complete) \\ tsNH4 & Time series of NH4 concentration in a waste-water system (with NAs) \\ tsNH4Complete & Time series of NH4 concentration in a waste-water system (complete) \\ \bottomrule \end{tabular} \caption{Overview Datasets} \label{tab:overviewdatasets} \end{table} \noindent \strong{tsAirgap} \noindent\\ The \code{tsAirgap} time series has 144 rows and the incomplete version includes 13 NA values. It represents the monthly totals of international airline passengers from 1949 to 1960. The time series originates from \cite{Box76a} and is a commonly used example in time series analysis literature. Originally known as 'AirPassengers' or 'airpass' this version is renamed to 'tsAirgap' in order to improve differentiation from the complete series (gap signifies that NAs were introduced). The characteristics (strong trend, strong seasonal behavior) make the \code{tsAirgap} series a great example for time series imputation. \noindent As already mentioned in order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values (\code{tsAirgapComplete}), which can be used as ground truth. Another series with NAs, on which the imputation algorithms can be applied (\code{tsAirgap}). While the missing data for \code{tsNH4} and \code{tsHeating} were each introduced according to patterns observed in very similar time series from the same source, the missing observations in \code{tsAirgap} were created based on general missing data patterns.
\noindent\\ \strong{tsNH4} \noindent\\ The \code{tsNH4} time series has 4552 rows and the incomplete version includes 883 NA values. It represents the NH4 concentration in a waste-water system measured from 30.11.2010 - 16:10 to 01.01.2011 - 6:40 in 10 minute steps. The time series is derived from the dataset of the Genetic and Evolutionary Computation Conference (GECCO) Industrial Challenge 2014~\footnote{\url{http://www.spotseven.de/gecco-challenge/gecco-challenge-2014/}}. \noindent As already mentioned in order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values (\code{tsNH4Complete}), which can be used as ground truth. Another series with NAs (\code{tsNH4}), on which the imputation algorithms can be applied. The pattern for the NA occurrence was derived from the same series / sensors, but from an earlier time interval. Thus, it is a very realistic missing data pattern. Beware, since the time series has a lot of observations, some of the more complex algorithms like \code{na\_kalman} will need some time till they are finished. \noindent\\ \strong{tsHeating} \noindent\\ The \code{tsHeating} time series has 606837 rows and the incomplete version includes 57391 NA values. It represents a heating systems' supply temperature measured from 18.11.2013 - 05:12:00 to 13.01.2015 - 15:08:00 in 1 minute steps. The time series originates from the GECCO Industrial Challenge 2015~\footnote{\url{http://www.spotseven.de/gecco-challenge/gecco-challenge-2015/}}. This was a challenge about 'Recovering missing information in heating system operating data'. Goal was to impute missing values in heating system sensor data as accurate as possible. \noindent As already mentioned in order to use this series for comparing imputation algorithm results, there are two time series provided. One series without missing values (\code{tsHeatingComplete}), which can be used as ground truth. 
Another series with NAs (\code{tsHeating}), on which the imputation algorithms can be applied. The NAs thereby were inserted according to patterns found in similar time series, i.e., patterns occurring in other heating systems. Beware, since it is a very large time series, some of the more complex algorithms like \code{na\_kalman} may need up to several days to complete on standard hardware. %================================================================================ \section{Usage examples} \label{sec:Usage examples} To start working with the \pkg{imputeTS} package, install either the stable version from CRAN or the development version from GitHub (\url{https://github.com/SteffenMoritz/imputeTS}). The stable version from CRAN is hereby recommended. \subsection{Imputation algorithms} All imputation algorithms are used the same way. Input has to be either a numeric time series or a numeric vector. As output, a version of the input data with all missing values replaced by imputed values is returned. Here is a small example, to show how to use the imputation algorithms. (all imputation functions start with na\_'algorithm name')\\\\ \noindent For this we first need to create an example input series with missing data. \begin{example} # Create a short example time series with missing values x <- ts(c(1, 2, 3, 4, 5, 6, 7, 8, NA, NA, 11, 12)) \end{example} \noindent On this time series we can apply different imputation algorithms. We start with using \code{na\_mean}, which substitutes the NAs with mean values. \begin{example} # Impute the missing values with na_mean na_mean(x) \end{example} [1]~~~~1.0~~~~2.0~~~~3.0~~~~4.0~~~~5.0~~~~6.0~~~~7.0~~~~8.0 \textbf{~~~~5.9~~~~5.9}~~~~11.0~~~~12.0 \noindent\\ Most of the functions also have additional options that provide further algorithms (of the same algorithm category).
In the example below it can be seen that \code{na\_mean} can also be called with \code{option="median"}, which substitutes the NAs with median values. \begin{example} # Impute the missing values with na_mean using option median na_mean(x, option="median") \end{example} [1]~~~~1.0~~~~2.0~~~~3.0~~~~4.0~~~~5.0~~~~6.0~~~~7.0 ~~~~8.0~~~~\textbf{5.5~~~~5.5}~~~~11.0~~~~12.0 \noindent\\ While \code{na\_interpolation} and all other imputation functions are used the same way, the results produced may be different. As can be seen below, for this series linear interpolation gives more reasonable results. \begin{example} # Impute the missing values with na_interpolation na_interpolation(x) \end{example} [1]~~~~1~~~~2~~~~3~~~~4~~~~5~~~~6~~~~7 ~~~~8~~~~\textbf{9~~~~10}~~~~11~~~~12 \noindent\\ For longer and more complex time series (with trend and seasonality) than in this example it is always a good idea to try \code{na\_kalman} and \code{na\_seadec}, since these functions very often produce the best results. These functions are called the same easy way as all other imputation functions. \noindent Here is a usage example for the \code{na\_kalman} function applied on the \code{tsAirgap} (described in ~\ref{subsec:Datasets}) time series. As can be seen in Figure~\ref{fig:imputations2}, \code{na\_kalman} provides really good results for this series, which contains a strong seasonality and a strong trend. 
\begin{example} # Impute the missing values with na_kalman # (tsAirgap is an example time series provided by the imputeTS package) imp <- na_kalman(tsAirgap) #Code for visualization ggplot_na_imputations(tsAirgap, imp, tsAirgapComplete) \end{example} \begin{figure} [h] \centering \includegraphics[width=5.7in]{Figures.d/imputations2.png} \caption{Results of imputation with na\_kalman compared to real values} \label{fig:imputations2} \end{figure} \noindent \FloatBarrier \subsection{ggplot\_na\_distribution} This function visualizes the distribution of missing values within a time series. Therefore, the time series is plotted and whenever a value is NA the background is colored differently. This gives a nice overview, where in the time series most of the missing values occur. An example usage of the function can be seen below (for the plot see Figure~\ref{fig:distribution}). \begin{example} # Example Code 'ggplot_na_distribution' # (tsAirgap is an example time series provided by the imputeTS package) # Visualize the missing values in this time series ggplot_na_distribution(tsAirgap) \end{example} \begin{figure} [h] \centering \includegraphics[width=5.7in]{Figures.d/distribution.png} \caption{Example for ggplot\_na\_distribution} \label{fig:distribution} \end{figure} \FloatBarrier \noindent As can be seen in Figure~\ref{fig:distribution}, in areas with missing data the background is colored red. The whole plot is pretty much self-explanatory. The plotting function itself needs no further configuration parameters, nevertheless it allows passing through of plot parameters (via ...). \FloatBarrier \noindent\\ \subsection{ggplot\_na\_distribution2} This function also visualizes the distribution of missing values within a time series. This is done as a barplot, which is especially useful if the time series would otherwise be too large to be plotted. Multiple observations for time intervals are grouped together and represented as bars. 
For these intervals, information about the amount of missing values is shown. An example usage of the function can be seen below (for the plot see Figure~\ref{fig:distributionBar}). \begin{example} # Example Code 'ggplot_na_distribution2' # (tsHeating is an example time series provided by the imputeTS package) # Visualize the missing values in this time series ggplot_na_distribution2(tsHeating) \end{example} \begin{figure} [h] \centering \includegraphics[width=5.7in]{Figures.d/distributionbar.png} \caption{Example for ggplot\_na\_distribution2} \label{fig:distributionBar} \end{figure} \FloatBarrier \noindent\\ As can be seen on the x-axis of Figure~\ref{fig:distributionBar}, the \code{tsHeating} series is, with over 600,000 observations, a very large time series. While the missing values in the \code{tsAirgap} series (144 observations) can be visualized with \code{ggplot\_na\_distribution} like in Figure~\ref{fig:distribution}, this would for sure not work out for \code{tsHeating}. There just isn't enough space for 600,000 single consecutive observations/points in the plotting area. The \code{ggplot\_na\_distribution2} function solves this problem. Multiple observations are grouped together in intervals. The 'breaks' parameter in the example defines that there should be 20 intervals used. This means every interval in Figure~\ref{fig:distributionBar} represents approximately 30,000 observations. The first five intervals are completely green, which means there are no missing values present. This means from observation 1 up to observation 150,000 there are no missing values in the data. In the middle and at the end of the series there are several intervals each having around 40\% of missing data. This means in these intervals 12,000 out of 30,000 observations are NA. All in all, the plot is able to give a nice but rough overview about the NA distribution in very large time series. 
\noindent\\ \subsection{ggplot\_na\_gapsize} This plotting function can be used to visualize how often different NA gaps (NAs in a row) occur in a time series. The function shows this information as a ranking. This ranking can be ordered by the total number of NAs each gap size accounts for (number of occurrences of the gap size * gap length) or just by the number of occurrences of gap sizes. In the end the results can be read like this: In time series x, 3 NAs in a row occur most often with 20 occurrences, 6 NAs in a row occur 2nd most with 5 occurrences, 2 NAs in a row occur 3rd most with 3 occurrences. An example usage of the function can be seen below (for the plot see Figure~\ref{fig:gapsize}). \begin{example} # Example Code 'ggplot_na_gapsize' # (tsNH4 is an example time series provided by the imputeTS package) # Visualize the top gap sizes / NAs in a row ggplot_na_gapsize(tsNH4) \end{example} \begin{figure} [h] \centering \includegraphics[width=5.7in]{Figures.d/gapsize.png} \caption{Example for ggplot\_na\_gapsize} \label{fig:gapsize} \end{figure} \FloatBarrier \noindent The example plot (Figure~\ref{fig:gapsize}) reads as follows: In the time series \code{tsNH4} gap size 157 occurs just 1 time, but makes up for most NAs of all gap sizes (157 NAs). A gap size of 91 (91 NAs in a row) also occurs just once, but makes up for 2nd most NAs (91 NAs). A gap size of 42 occurs two times in the time series, which leads to 3rd most overall (84 NAs). A gap size of one (no other NAs before or behind the NA) occurs 68 times, which makes this 4th in overall NAs (68 NAs). \noindent\\ \subsection{ggplot\_na\_imputations} This plot can be used to visualize the imputed values for a time series. Therefore, the imputed values (filled NA gaps) are shown in a different color than the other values. The function is used as below and Figure~\ref{fig:imputations} shows the output. 
\begin{example} # Example Code 'ggplot_na_imputations' # (tsAirgap is an example time series provided by the imputeTS package) # Step 1: Perform imputation for tsAirgap using na_mean tsAirgap.imp <- na_mean(tsAirgap) # Step 2: Visualize the imputed values in the time series ggplot_na_imputations(tsAirgap, tsAirgap.imp) \end{example} \noindent The visual inspection of Figure~\ref{fig:imputations} indicates that the imputed values (red) do not fit very well in the \code{tsAirgap} series. This is caused by \code{na\_mean} being used for imputation of a series with a strong trend. The plotting function enables users to quickly detect such problems in the imputation results. If the ground truth is known for the imputed values, this information can also be added to the plot. The plotting function itself needs no further configuration parameters. Nevertheless, it allows passing through of plot parameters (via ...). \noindent \begin{figure} [h] \centering \includegraphics[width=5.7in]{Figures.d/imputations.png} \caption{Example for ggplot\_na\_imputations} \label{fig:imputations} \end{figure} \FloatBarrier \subsection{statsNA} The \code{statsNA} function prints summary stats about the distribution of missing values in univariate time series. 
Here is a short explanation about the information it gives: \begin{itemize} \item Length of time series\\ Number of observations in the time series (including NAs) \item Number of Missing Values\\ Number of missing values in the time series \item Percentage of Missing Values\\ Percentage of missing values in the time series \item Stats for Bins\\ Number/percentage of missing values for the split into bins \item Longest NA gap\\ Longest series of consecutive missing values (NAs in a row) in the time series \item Most frequent gap size\\ Most frequently occurring series of missing values in the time series \item Gap size accounting for most NAs\\ The series of consecutive missing values that accounts for most missing values overall in the time series \item Overview NA series\\ Overview about how often each series of consecutive missing values occurs. Series occurring 0 times are skipped \end{itemize} \noindent The function is used as below and Figure~\ref{fig:statsNA} shows the output. \noindent\\ \begin{example} # Example Code 'statsNA' # (tsNH4 is an example time series provided by the imputeTS package) # Print stats about the missing data statsNA(tsNH4) \end{example} \begin{figure} [h] \centering \includegraphics[width=5in]{Figures.d/statsna.png} \caption{Excerpt of statsNA output} \label{fig:statsNA} \end{figure} \FloatBarrier \subsection{Datasets} Using the datasets is self-explanatory, after the package is loaded they are directly available and usable under their name. No call of data() is needed. For every dataset there is always a complete version (without NAs) and an incomplete version (containing NAs) available. 
\begin{example} # Example Code to use tsAirgap dataset library("imputeTS") tsAirgap \end{example} \begin{figure} [h] \centering \includegraphics[width=4in]{Figures.d/tsairgap.png} \caption{Example tsAirgap time series} \label{fig:tsAirgap} \end{figure} \FloatBarrier %================================================================================ \section{Conclusions} \label{sec:Conclusions} Missing data is a very common problem for all kinds of data. However, in case of univariate time series most standard algorithms and existing functions within R packages cannot be applied. \noindent\\ This paper presented the \pkg{imputeTS} package that provides a collection of algorithms and tools especially tailored to this task. Using example time series, we illustrated the ease of use and the advantages of the provided functions. Simple algorithms as well as more complicated ones can be applied in the same simple and user-friendly manner. \noindent\\\\ The functionality provided makes the \pkg{imputeTS} package a good choice for preprocessing of time series ahead of further analysis steps that require complete absence of missing values. \noindent\\\\ Future research and development plans for forthcoming versions of the package include adding additional time series algorithm options to choose from. %================================================================================ \section{Acknowledgment} \label{sec:Acknowlegment} Parts of this work have been developed in the project '\textit{IMProvT: Intelligente Messverfahren zur Prozessoptimierung von Trinkwasserbereitstellung und -verteilung}' (reference number: 03ET1387A). Kindly supported by the Federal Ministry of Economic Affairs and Energy of the Federal Republic of Germany. 
\begin{figure} [h] \centering \includegraphics[width=2.5in]{Figures.d/sponsorlogo.jpg} \label{fig:sponsor} \end{figure} \FloatBarrier %================================================================================ \begin{thebibliography}{21} \providecommand{\natexlab}[1]{#1} \providecommand{\url}[1]{\texttt{#1}} \expandafter\ifx\csname urlstyle\endcsname\relax \providecommand{\doi}[1]{doi: #1}\else \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi \bibitem[Bar-Joseph et~al.(2003)Bar-Joseph, Gerber, Gifford, Jaakkola, and Simon]{bar2003continuous} Z.~Bar-Joseph, G.~K. Gerber, D.~K. Gifford, T.~S. Jaakkola, and I.~Simon. \newblock Continuous representations of time-series gene expression data. \newblock \emph{Journal of Computational Biology}, 10\penalty0 (3-4):\penalty0 341--356, 2003. \bibitem[Billinton et~al.(1996)Billinton, Chen, and Ghajar]{billinton1996time} R.~Billinton, H.~Chen, and R.~Ghajar. \newblock Time-series models for reliability evaluation of power systems including wind energy. \newblock \emph{Microelectronics Reliability}, 36\penalty0 (9):\penalty0 1253--1261, 1996. \bibitem[Box et~al.(2015)Box, Jenkins, Reinsel, and Ljung]{Box76a} G.~E. Box, G.~M. Jenkins, G.~C. Reinsel, and G.~M. Ljung. \newblock \emph{Time Series Analysis: Forecasting and Control}. \newblock John Wiley \& Sons, 2015. \bibitem[Dempster et~al.(1977)Dempster, Laird, and Rubin]{dempster1977maximum} A.~P. Dempster, N.~M. Laird, and D.~B. Rubin. \newblock Maximum likelihood from incomplete data via the {EM} algorithm. \newblock \emph{Journal of the royal statistical society. Series B (methodological)}, pages 1--38, 1977. \bibitem[Ford(1983)]{ford1983overview} B.~L. Ford. \newblock {An Overview of Hot-Deck Procedures}. \newblock \emph{Incomplete data in sample surveys}, 2\penalty0 (Part IV):\penalty0 185--207, 1983. \bibitem[Gottman(1981)]{gottman1981time} J.~M. Gottman. \newblock \emph{Time-series analysis: A comprehensive introduction for social scientists}, volume 400. 
\newblock Cambridge University Press Cambridge, 1981. \bibitem[Honaker et~al.(2011)Honaker, King, and Blackwell]{AMELIA} J.~Honaker, G.~King, and M.~Blackwell. \newblock {Amelia II: A Program for Missing Data}. \newblock \emph{Journal of Statistical Software}, 45\penalty0 (7):\penalty0 1--47, 2011. \newblock URL \url{http://www.jstatsoft.org/v45/i07/}. \bibitem[Hyndman(2016)]{forecast} R.~J. Hyndman. \newblock \emph{{forecast}: Forecasting functions for time series and linear models}, 2016. \newblock URL \url{http://github.com/robjhyndman/forecast}. \newblock R package version 7.3. \bibitem[Josse and Husson(2016)]{missMDA} J.~Josse and F.~Husson. \newblock {missMDA: A Package for Handling Missing Values in Multivariate Data Analysis}. \newblock \emph{Journal of Statistical Software}, 70\penalty0 (1):\penalty0 1--31, 2016. \newblock \doi{10.18637/jss.v070.i01}. \bibitem[Kowarik and Templ(2016)]{VIM} A.~Kowarik and M.~Templ. \newblock Imputation with the {R} package {VIM}. \newblock \emph{Journal of Statistical Software}, 74\penalty0 (7):\penalty0 1--16, 2016. \newblock \doi{10.18637/jss.v074.i07}. \bibitem[Moritz(2016{\natexlab{a}})]{imputeTS} S.~Moritz. \newblock \emph{{imputeTS: Time Series Missing Value Imputation}}, 2016{\natexlab{a}}. \newblock URL \url{http://CRAN.R-project.org/package=imputeTS}. \newblock R package version 1.7. \bibitem[Moritz(2016{\natexlab{b}})]{imputeTSmanual} S.~Moritz. \newblock \emph{package imputeTS}, 2016{\natexlab{b}}. \newblock URL \url{http://cran.r-project.org/web/packages/imputeTS/imputeTS.pdf}. \newblock R package version 1.7. \bibitem[{Moritz} et~al.(2015){Moritz}, {Sard{\'a}}, {Bartz-Beielstein}, {Zaefferer}, and {Stork}]{Moritz15a} S.~{Moritz}, A.~{Sard{\'a}}, T.~{Bartz-Beielstein}, M.~{Zaefferer}, and J.~{Stork}. \newblock {Comparison of different Methods for Univariate Time Series Imputation in R}. \newblock \emph{ArXiv e-prints}, Oct. 2015. \bibitem[Pebesma(2012)]{spacetime} E.~Pebesma. 
\newblock {spacetime: Spatio-Temporal Data in R}. \newblock \emph{Journal of Statistical Software}, 51\penalty0 (7):\penalty0 1--30, 2012. \newblock URL \url{http://www.jstatsoft.org/v51/i07/}. \bibitem[{Rmetrics Core Team} et~al.(2015){Rmetrics Core Team}, Wuertz, Setz, and Chalabi]{timeSeries} {Rmetrics Core Team}, D.~Wuertz, T.~Setz, and Y.~Chalabi. \newblock \emph{{timeSeries: Rmetrics - Financial Time Series Objects}}, 2015. \newblock URL \url{https://CRAN.R-project.org/package=timeSeries}. \newblock R package version 3022.101.2. \bibitem[Rubin(1987)]{rubin1987multiple} D.~B. Rubin. \newblock \emph{Multiple imputation for nonresponse in surveys}. \newblock New York: Wiley, 1987. \bibitem[Ryan and Ulrich(2014)]{xts} J.~A. Ryan and J.~M. Ulrich. \newblock \emph{{xts: eXtensible Time Series}}, 2014. \newblock URL \url{https://CRAN.R-project.org/package=xts}. \newblock R package version 0.9-7. \bibitem[Taylor(2007)]{taylor2007modelling} S.~J. Taylor. \newblock \emph{Modelling financial time series (second edition)}. \newblock World Scientific Publishing, 2007. \bibitem[Vacek and Ashikaga(1980)]{vacek1980examination} P.~Vacek and T.~Ashikaga. \newblock An examination of the nearest neighbor rule for imputing missing values. \newblock \emph{Proc. Statist. Computing Sect., Amer. Statist. Ass}, pages 326--331, 1980. \bibitem[{van Buuren} and Groothuis-Oudshoorn(2011)]{mice} S.~{van Buuren} and K.~Groothuis-Oudshoorn. \newblock {mice: Multivariate Imputation by Chained Equations in R}. \newblock \emph{Journal of Statistical Software}, 45\penalty0 (3):\penalty0 1--67, 2011. \newblock URL \url{http://www.jstatsoft.org/v45/i03/}. \bibitem[Zeileis and Grothendieck(2005)]{zoo} A.~Zeileis and G.~Grothendieck. \newblock {zoo: S3 Infrastructure for Regular and Irregular Time Series}. \newblock \emph{Journal of Statistical Software}, 14\penalty0 (6):\penalty0 1--27, 2005. \newblock URL \url{http://www.jstatsoft.org/v14/i06/}. 
\end{thebibliography} %================================================================================ \address{Steffen Moritz\\ Cologne University of Applied Sciences\\ Cologne, Germany\\} \email{steffen.moritz10@gmail.com} \address{Thomas Bartz-Beielstein\\ Cologne University of Applied Sciences\\ Cologne, Germany\\} \email{bartz.beielstein@th-koeln.de} \end{article} \end{document}