Repository: easystats/datawizard Branch: main Commit: 89daeba7d094 Files: 321 Total size: 1.8 MB Directory structure: gitextract_nyyro61y/ ├── .Rbuildignore ├── .dev/ │ ├── _BENCHMARK_RESHAPE.R │ ├── html5.R │ ├── revdepcheck.R │ ├── test-value_at.R │ └── value_at.R ├── .git-blame-ignore-revs ├── .github/ │ ├── .gitignore │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── FUNDING.yml │ ├── SUPPORT.md │ ├── dependabot.yaml │ └── workflows/ │ ├── R-CMD-check-hard.yaml │ ├── R-CMD-check.yaml │ ├── check-all-examples.yaml │ ├── check-link-rot.yaml │ ├── check-random-test-order.yaml │ ├── check-readme.yaml │ ├── check-spelling.yaml │ ├── check-styling.yaml │ ├── check-test-warnings.yaml │ ├── check-vignette-warnings.yaml │ ├── html-5-check.yaml │ ├── lint-changed-files.yaml │ ├── lint.yaml │ ├── pkgdown-no-suggests.yaml │ ├── pkgdown.yaml │ ├── test-coverage-examples.yaml │ ├── test-coverage.yaml │ └── update-to-latest-easystats.yaml ├── .gitignore ├── .lintr ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R/ │ ├── adjust.R │ ├── assign_labels.R │ ├── categorize.R │ ├── center.R │ ├── contrs.R │ ├── convert_na_to.R │ ├── convert_to_na.R │ ├── data.R │ ├── data_addprefix.R │ ├── data_arrange.R │ ├── data_codebook.R │ ├── data_duplicated.R │ ├── data_extract.R │ ├── data_group.R │ ├── data_match.R │ ├── data_merge.R │ ├── data_modify.R │ ├── data_partition.R │ ├── data_peek.R │ ├── data_read.R │ ├── data_relocate.R │ ├── data_remove.R │ ├── data_rename.R │ ├── data_replicate.R │ ├── data_rescale.R │ ├── data_restoretype.R │ ├── data_reverse.R │ ├── data_rotate.R │ ├── data_seek.R │ ├── data_select.R │ ├── data_separate.R │ ├── data_summary.R │ ├── data_tabulate.R │ ├── data_to_long.R │ ├── data_to_wide.R │ ├── data_unique.R │ ├── data_unite.R │ ├── data_write.R │ ├── data_xtabulate.R │ ├── datawizard-package.R │ ├── demean.R │ ├── describe_distribution.R │ ├── descriptives.R │ ├── extract_column_names.R │ ├── format.R │ ├── labels_to_levels.R │ ├── makepredictcall.R │ ├── mean_sd.R │ ├── means_by_group.R │ ├── normalize.R │ ├── ranktransform.R │ ├── recode_into.R │ ├── recode_values.R │ ├── remove_empty.R │ ├── replace_nan_inf.R │ ├── rescale_weights.R │ ├── reshape_ci.R │ ├── row_count.R │ ├── row_means.R │ ├── select_nse.R │ ├── skewness_kurtosis.R │ ├── slide.R │ ├── smoothness.R │ ├── standardize.R │ ├── standardize.models.R │ ├── text_format.R │ ├── to_factor.R │ ├── to_numeric.R │ ├── unnormalize.R │ ├── unstandardize.R │ ├── utils-cols.R │ ├── utils-rows.R │ ├── utils.R │ ├── utils_labels.R │ ├── utils_standardize_center.R │ ├── visualisation_recipe.R │ ├── weighted_mean_median_sd_mad.R │ └── winsorize.R ├── README.Rmd ├── README.md ├── air.toml ├── cran-comments.md ├── data/ │ ├── efc.RData │ └── nhanes_sample.RData ├── datawizard.Rproj ├── datawizard.code-workspace ├── inst/ │ ├── CITATION │ └── WORDLIST ├── man/ │ ├── adjust.Rd │ ├── as.prop.table.Rd │ ├── assign_labels.Rd │ ├── categorize.Rd │ ├── center.Rd │ ├── coef_var.Rd │ ├── coerce_to_numeric.Rd │ ├── colnames.Rd │ ├── contr.deviation.Rd │ ├── convert_na_to.Rd │ ├── convert_to_na.Rd │ ├── data_arrange.Rd │ ├── data_codebook.Rd │ ├── data_duplicated.Rd │ ├── data_extract.Rd │ ├── data_group.Rd │ ├── data_match.Rd │ ├── data_merge.Rd │ ├── data_modify.Rd │ ├── data_partition.Rd │ ├── data_peek.Rd │ ├── data_prefix_suffix.Rd │ ├── data_read.Rd │ ├── data_relocate.Rd │ ├── data_rename.Rd │ ├── data_replicate.Rd │ ├── data_restoretype.Rd │ ├── data_rotate.Rd │ ├── data_seek.Rd │ ├── data_separate.Rd │ ├── data_summary.Rd │ ├── data_tabulate.Rd │ ├── data_to_long.Rd │ ├── data_to_wide.Rd │ ├── data_unique.Rd │ ├── data_unite.Rd │ ├── datawizard-package.Rd │ ├── demean.Rd │ ├── describe_distribution.Rd │ ├── distribution_mode.Rd │ ├── efc.Rd │ ├── extract_column_names.Rd │ ├── labels_to_levels.Rd │ ├── makepredictcall.dw_transformer.Rd │ ├── mean_sd.Rd │ ├── means_by_group.Rd │ ├── nhanes_sample.Rd │ ├── normalize.Rd │ ├── ranktransform.Rd │ ├── recode_into.Rd │ ├── recode_values.Rd │ ├── reexports.Rd │ ├── remove_empty.Rd │ ├── replace_nan_inf.Rd │ ├── rescale.Rd │ ├── rescale_weights.Rd │ ├── reshape_ci.Rd │ ├── reverse.Rd │ ├── row_count.Rd │ ├── row_means.Rd │ ├── rownames.Rd │ ├── skewness.Rd │ ├── slide.Rd │ ├── smoothness.Rd │ ├── standardize.Rd │ ├── standardize.default.Rd │ ├── text_format.Rd │ ├── to_factor.Rd │ ├── to_numeric.Rd │ ├── visualisation_recipe.Rd │ ├── weighted_mean.Rd │ └── winsorize.Rd ├── paper/ │ └── JOSS_files/ │ ├── apa.csl │ ├── paper.Rmd │ ├── paper.bib │ ├── paper.log │ └── paper.md ├── pkgdown/ │ └── _pkgdown.yaml ├── tests/ │ ├── testthat/ │ │ ├── _snaps/ │ │ │ ├── categorize.md │ │ │ ├── contr.deviation.md │ │ │ ├── data_codebook.md │ │ │ ├── data_modify.md │ │ │ ├── data_partition.md │ │ │ ├── data_peek.md │ │ │ ├── data_read.md │ │ │ ├── data_rescale.md │ │ │ ├── data_seek.md │ │ │ ├── data_separate.md │ │ │ ├── data_summary.md │ │ │ ├── data_tabulate.md │ │ │ ├── data_to_factor.md │ │ │ ├── data_to_long.md │ │ │ ├── data_to_numeric.md │ │ │ ├── demean.md │ │ │ ├── describe_distribution.md │ │ │ ├── empty-dataframe.md │ │ │ ├── means_by_group.md │ │ │ ├── normalize.md │ │ │ ├── print.dw_transformer.md │ │ │ ├── ranktransform.md │ │ │ ├── rescale_weights.md │ │ │ ├── reshape_ci.md │ │ │ ├── skewness-kurtosis.md │ │ │ ├── smoothness.md │ │ │ ├── text_format.md │ │ │ ├── windows/ │ │ │ │ └── means_by_group.md │ │ │ └── winsorization.md │ │ ├── helper-state.R │ │ ├── helper.R │ │ ├── test-adjust.R │ │ ├── test-assign_labels.R │ │ ├── test-attributes-grouped-df.R │ │ ├── test-attributes.R │ │ ├── test-categorize.R │ │ ├── test-center.R │ │ ├── test-coef_var.R │ │ ├── test-contr.deviation.R │ │ ├── test-convert_na_to.R │ │ ├── test-convert_to_na.R │ │ ├── test-data_addprefix.R │ │ ├── test-data_arrange.R │ │ ├── test-data_codebook.R │ │ ├── test-data_duplicated.R │ │ ├── test-data_extract.R │ │ ├── test-data_group.R │ │ ├── test-data_match.R │ │ ├── test-data_merge.R │ │ ├── test-data_modify.R │ │ ├── test-data_partition.R │ │ ├── test-data_peek.R │ │ ├── test-data_read.R │ │ ├── test-data_recode.R │ │ ├── test-data_relocate.R │ │ ├── test-data_remove.R │ │ ├── test-data_rename.R │ │ ├── test-data_reorder.R │ │ ├── test-data_replicate.R │ │ ├── test-data_rescale.R │ │ ├── test-data_restoretype.R │ │ ├── test-data_reverse.R │ │ ├── test-data_rotate.R │ │ ├── test-data_seek.R │ │ ├── test-data_select.R │ │ ├── test-data_separate.R │ │ ├── test-data_shift.R │ │ ├── test-data_summary.R │ │ ├── test-data_tabulate.R │ │ ├── test-data_to_factor.R │ │ ├── test-data_to_long.R │ │ ├── test-data_to_numeric.R │ │ ├── test-data_to_wide.R │ │ ├── test-data_unique.R │ │ ├── test-data_unite.R │ │ ├── test-data_write.R │ │ ├── test-demean.R │ │ ├── test-describe_distribution.R │ │ ├── test-distributions.R │ │ ├── test-empty-dataframe.R │ │ ├── test-extract_column_names.R │ │ ├── test-labelled_data.R │ │ ├── test-labels_to_levels.R │ │ ├── test-makepredictcall.R │ │ ├── test-mean_sd.R │ │ ├── test-means_by_group.R │ │ ├── test-normalize.R │ │ ├── test-print.dw_transformer.R │ │ ├── test-ranktransform.R │ │ ├── test-recode_into.R │ │ ├── test-replace_nan_inf.R │ │ ├── test-rescale_weights.R │ │ ├── test-reshape_ci.R │ │ ├── test-row_count.R │ │ ├── test-row_means.R │ │ ├── test-select_nse.R │ │ ├── test-skewness-kurtosis.R │ │ ├── test-smoothness.R │ │ ├── test-standardize-data.R │ │ ├── test-standardize_datagrid.R │ │ ├── test-standardize_models.R │ │ ├── test-std_center.R │ │ ├── test-std_center_scale_args.R │ │ ├── test-text_format.R │ │ ├── test-unnormalize.R │ │ ├── test-utils.R │ │ ├── test-utils_cols.R │ │ ├── test-utils_rows.R │ │ ├── test-weighted-stats.R │ │ └── test-winsorization.R │ └── testthat.R └── vignettes/ ├── .gitignore ├── bibliography.bib ├── overview_of_vignettes.Rmd ├── selection_syntax.Rmd ├── standardize_data.Rmd └── tidyverse_translation.Rmd ================================================ FILE CONTENTS ================================================ ================================================ FILE: .Rbuildignore ================================================ ^\cache$ ^codemeta\.json$ ^Meta$ ^doc$ ^.*\.Rproj$ ^\.Rproj\.user$ ^README\.Rmd$ ^Rplots.pdf$ ^README-.*\.png$ ^CONDUCT\.md$ ^SECURITY\.md$ ^cran-comments\.md$ ^CODE_OF_CONDUCT\.md$ ^SUPPORT\.md$ ^\.github$ ^NEWS$ ^docs$ ^revdep$ publication/* ^codecov\.yml$ ^\.coveralls\.yml$ ^\.travis\.yml$ ^_pkgdown\.yml$ ^_pkgdown\.yaml$ ^appveyor\.yml$ ^.gitlab-ci\.yml$ ^data-raw$ ^pkgdown$ ^\.httr-oauth$ ^CRAN-RELEASE$ tests\^spelling ^LICENSE\.md$ ^\.lintr$ \.code-workspace$ ^\.circleci$ ^tests/manual$ ^revdep$ ^\.covrignore$ ^\.github/ISSUE_TEMPLATE$ ^paper.*$ references.bib ^API$ ^\.pre-commit-config\.yaml$ ^\.github/workflows/R\.yaml$ ^\.github/workflows/pr-commands\.yaml$ ^hextools/. ^WIP/. ^CRAN-SUBMISSION$ docs ^.dev$ ^vignettes/s. ^vignettes/t. ^[\.]?air\.toml$ ^\.vscode$ ^\.git-blame-ignore-revs$ ================================================ FILE: .dev/_BENCHMARK_RESHAPE.R ================================================ library(tidyr) library(dplyr) library(datawizard) ### DATA_TO_LONG ========================================== # SLOW (5M rows) wide_data <- data.frame(replicate(5, rnorm(10))) tmp <- list() for (i in 1:500000) { tmp[[i]] <- wide_data } tmp <- data.table::rbindlist(tmp) |> as_tibble() ex1_l <- bench::mark( old = old_data_to_long(tmp), new = data_to_long(tmp), tidyr = pivot_longer(tmp, cols = everything()), iterations = 10 ) ex2_l <- bench::mark( old = relig_income %>% old_data_to_long(-"religion", names_to = "income", values_to = "count"), new = relig_income %>% data_to_long(-"religion", names_to = "income", values_to = "count"), tidyr = relig_income %>% pivot_longer(!religion, names_to = "income", values_to = "count"), iterations = 100 ) ex3_l <- bench::mark( old = billboard %>% old_data_to_long( cols = starts_with("wk"), names_to = "week", values_to = "rank" ), new = billboard %>% data_to_long( cols = starts_with("wk"), names_to = "week", values_to = "rank" ), tidyr = billboard %>% pivot_longer( cols = starts_with("wk"), names_to = "week", values_to = "rank" ), iterations = 50 ) ex4_l <- bench::mark( old = who |> old_data_to_long( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_sep = "_", values_to = "count" ), new = who |> data_to_long( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_sep = "_", values_to = "count" ), tidyr = who |> pivot_longer( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_sep = "_", values_to = "count" ), iterations = 10 ) ex5_l <- bench::mark( old = who |> old_data_to_long( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ), new = who |> data_to_long( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ), tidyr = who |> pivot_longer( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ), iterations = 10 ) ### DATA_TO_WIDE ========================================== ex1_w <- bench::mark( old = fish_encounters %>% old_data_to_wide( names_from = "station", values_from = "seen", values_fill = 0 ), new = fish_encounters %>% data_to_wide( names_from = "station", values_from = "seen", values_fill = 0 ), tidyr = fish_encounters %>% pivot_wider( names_from = "station", values_from = "seen", values_fill = 0 ), iterations = 100 ) production <- expand_grid( product = letters, country = paste0(letters, "I"), year = 2000:2025 ) %>% mutate(production = rnorm(nrow(.))) ex2_w <- bench::mark( old = production %>% old_data_to_wide( names_from = c("product", "country"), values_from = "production" ), new = production %>% data_to_wide( names_from = c("product", "country"), values_from = "production" ), tidyr = production %>% pivot_wider( names_from = c(product, country), values_from = production ), iterations = 10 ) ex3_w <- bench::mark( old = production %>% old_data_to_wide( names_from = c("product", "country"), values_from = "production", names_glue = "prod_{product}_{country}" ), new = production %>% data_to_wide( names_from = c("product", "country"), values_from = "production", names_glue = "prod_{product}_{country}" ), tidyr = production %>% pivot_wider( names_from = c(product, country), values_from = production, names_glue = "prod_{product}_{country}" ), iterations = 10 ) tmp <- list() for (i in 1:1000) { tmp[[i]] <- us_rent_income } tmp <- data.table::rbindlist(tmp) |> as_tibble() tmp$GEOID <- rep(1:52000, each = 2) tmp$NAME <- as.character(rep(1:52000, each = 2)) ex4_w <- bench::mark( old = tmp %>% old_data_to_wide( names_from = "variable", values_from = c("estimate", "moe") ), new = tmp %>% data_to_wide( names_from = "variable", values_from = c("estimate", "moe") ), tidyr = tmp %>% pivot_wider( names_from = "variable", values_from = c("estimate", "moe") ), iterations = 10 ) # SLOW (1M rows) ============ set.seed(123) contacts <- tibble( id = rep(1:500000, each = 2), field = rep(c("a", "b"), 500000), value = sample(letters, 1000000, replace = TRUE) ) ex5_w <- bench::mark( old = contacts %>% old_data_to_wide(names_from = "field", values_from = "value"), new = contacts %>% data_to_wide(names_from = "field", values_from = "value"), tidyr = contacts %>% tidyr::pivot_wider(names_from = field, values_from = value), iterations = 1 ) # SLOWER (10M rows) ============ set.seed(123) contacts <- tibble( id = rep(1:5000000, each = 2), field = rep(c("a", "b"), 5000000), value = sample(letters, 10000000, replace = TRUE) ) ex6_w <- bench::mark( old = contacts %>% old_data_to_wide(names_from = "field", values_from = "value"), new = contacts %>% data_to_wide(names_from = "field", values_from = "value"), tidyr = contacts %>% tidyr::pivot_wider(names_from = field, values_from = value), iterations = 1 ) reprex:::prex({ ### DATA_TO_LONG ========================================== ex1_l ex2_l ex3_l ex4_l ex5_l ### DATA_TO_WIDE ========================================== ex1_w ex2_w ex3_w ex4_w ex5_w ex6_w }) ================================================ FILE: .dev/html5.R ================================================ Sys.setenv("_R_CHECK_RD_VALIDATE_RD2HTML_" = "true") Sys.setenv("_R_CHECK_CRAN_INCOMING_REMOTE_" = "false") Sys.setenv("_R_CHECK_CRAN_INCOMING_" = "false") rcmdcheck::rcmdcheck( args = c("--as-cran", "--no-codoc", "--no-examples", "--no-tests", "--no-vignettes", "--no-build-vignettes", "--ignore-vignettes", "--no-install"), build_args = c("--no-build-vignettes"), error_on = "note" ) ================================================ FILE: .dev/revdepcheck.R ================================================ library(revdepcheck) revdep_check(num_workers = 4) revdep_report() revdep_reset() ================================================ FILE: .dev/test-value_at.R ================================================ test_that("value_at", { data(efc, package = "datawizard") expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRUE) expect_equal(value_at(efc$c12hour, 4), NA_real_, ignore_attr = TRUE) expect_equal(value_at(efc$c12hour, 4, remove_na = TRUE), 168, ignore_attr = TRUE) expect_equal(value_at(efc$c12hour, 5:7), efc$c12hour[5:7], ignore_attr = TRUE) expect_equal(value_at(efc$e42dep, 123456, default = 55), 55, ignore_attr = TRUE) expect_null(value_at(efc$e42dep, 123456)) expect_null(value_at(efc$e42dep, NULL)) expect_error(value_at(efc$e42dep, NA), regex = "`position` can't") expect_error(value_at(efc$e42dep, c(3, NA)), regex = "`position` can't") }) ================================================ FILE: .dev/value_at.R ================================================ #' @title Find the value(s) at a specific position in a variable #' @name value_at #' #' @description This function can be used to extract one or more values at a #' specific position in a variable. #' #' @param x A vector or factor. #' @param position An integer or a vector of integers, indicating the position(s) #' of the value(s) to be returned. Negative values are counted from the end of #' the vector. If `NA`, an error is thrown. #' @param remove_na Logical, if `TRUE`, missing values are removed before #' computing the position. If `FALSE`, missing values are included in the #' computation. #' @param default The value to be returned if the position is out of range. #' #' @seealso `data_summary()` to use `value_at()` inside a `data_summary()` call. #' #' @return A vector with the value(s) at the specified position(s). #' #' @examples #' data(mtcars) #' # 5th value #' value_at(mtcars$mpg, 5) #' # last value #' value_at(mtcars$mpg, -1) #' # out of range, return default #' value_at(mtcars$mpg, 150) #' # return 2nd and fifth value #' value_at(mtcars$mpg, c(2, 5)) #' @export value_at <- function(x, position = 1, default = NULL, remove_na = FALSE) { if (remove_na) { x <- x[!is.na(x)] } n <- length(x) unlist(lapply(position, .values_at, x = x, n = n, default = default), use.names = FALSE) } # helper ---- .values_at <- function(x, position, n, default) { if (is.na(position)) { insight::format_error("`position` can't be `NA`.") } if (position < 0L) { position <- position + n + 1 } if (position <= 0 || position > n) { return(default) } x[position] } ================================================ FILE: .git-blame-ignore-revs ================================================ # Air formatting 5bd245e0bc12d2eecbcfa480a231b6df3ab7d684 ================================================ FILE: .github/.gitignore ================================================ *.html ================================================ FILE: .github/CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. ## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at patilindrajeet.science@gmail.com. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at . Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][https://github.com/mozilla/inclusion]. For answers to common questions about this code of conduct, see the FAQ at . Translations are available at . [homepage]: https://www.contributor-covenant.org ================================================ FILE: .github/CONTRIBUTING.md ================================================ # Contributing to datawizard This outlines how to propose a change to **datawizard**. ## Fixing typos Small typos or grammatical errors in documentation may be edited directly using the GitHub web interface, so long as the changes are made in the _source_ file. If you want to fix typos in the documentation, please edit the related `.R` file in the `R/` folder. Do _not_ edit an `.Rd` file in `man/`. ## Filing an issue The easiest way to propose a change or new feature is to file an issue. If you've found a bug, you may also create an associated issue. If possible, try to illustrate your proposal or the bug with a minimal [reproducible example](https://www.tidyverse.org/help/#reprex). ## Pull requests * Please create a Git branch for each pull request (PR). * Your contributed code should roughly follow the [R style guide](http://style.tidyverse.org), but in particular our [**easystats convention of code-style**](https://github.com/easystats/easystats#convention-of-code-style). * datawizard uses [roxygen2](https://cran.r-project.org/package=roxygen2), with [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html), for documentation. * datawizard uses [testthat](https://cran.r-project.org/package=testthat). Adding tests to the PR makes it easier for me to merge your PR into the code base. * If your PR is a user-visible change, you may add a bullet to the top of `NEWS.md` describing the changes made. You may optionally add your GitHub username, and links to relevant issue(s)/PR(s). ## Code of Conduct Please note that this project is released with a [Contributor Code of Conduct](https://easystats.github.io/datawizard/CODE_OF_CONDUCT.html). By participating in this project you agree to abide by its terms. ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: easystats ================================================ FILE: .github/SUPPORT.md ================================================ # Getting help with `{datawizard}` Thanks for using `{datawizard}`. Before filing an issue, there are a few places to explore and pieces to put together to make the process as smooth as possible. Start by making a minimal **repr**oducible **ex**ample using the [reprex](http://reprex.tidyverse.org/) package. If you haven't heard of or used reprex before, you're in for a treat! Seriously, reprex will make all of your R-question-asking endeavors easier (which is a pretty insane ROI for the five to ten minutes it'll take you to learn what it's all about). For additional reprex pointers, check out the [Get help!](https://www.tidyverse.org/help/) resource used by the tidyverse team. Armed with your reprex, the next step is to figure out where to ask: * If it's a question: start with StackOverflow. There are more people there to answer questions. * If it's a bug: you're in the right place, file an issue. * If you're not sure: let the community help you figure it out! If your problem _is_ a bug or a feature request, you can easily return here and report it. Before opening a new issue, be sure to [search issues and pull requests](https://github.com/easystats/datawizard/issues) to make sure the bug hasn't been reported and/or already fixed in the development version. By default, the search will be pre-populated with `is:issue is:open`. You can [edit the qualifiers](https://help.github.com/articles/searching-issues-and-pull-requests/) (e.g. `is:pr`, `is:closed`) as needed. For example, you'd simply remove `is:open` to search _all_ issues in the repo, open or closed. Thanks for your help! ================================================ FILE: .github/dependabot.yaml ================================================ version: 2 updates: # Keep dependencies for GitHub Actions up-to-date - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" ================================================ FILE: .github/workflows/R-CMD-check-hard.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help # # NOTE: This workflow only directly installs "hard" dependencies, i.e. Depends, # Imports, and LinkingTo dependencies. Notably, Suggests dependencies are never # installed, with the exception of testthat, knitr, and rmarkdown. The cache is # never used to avoid accidentally restoring a cache containing a suggested # dependency. on: push: branches: [main, master] pull_request: branches: [main, master] name: R-CMD-check-hard jobs: R-CMD-check-hard: uses: easystats/workflows/.github/workflows/R-CMD-check-hard.yaml@main ================================================ FILE: .github/workflows/R-CMD-check.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help # # NOTE: This workflow is overkill for most R packages and # check-standard.yaml is likely a better choice. # usethis::use_github_action("check-standard") will install it. on: push: branches: [main, master] pull_request: branches: [main, master] name: R-CMD-check jobs: R-CMD-check: uses: easystats/workflows/.github/workflows/R-CMD-check.yaml@main ================================================ FILE: .github/workflows/check-all-examples.yaml ================================================ # Make sure all examples run successfully, even the ones that are not supposed # to be run or tested on CRAN machines by default. # # The examples that fail should use # - `if (FALSE) { ... }` (if example is included only for illustrative purposes) # - `try({ ... })` (if the intent is to show the error) # # This workflow helps find such failing examples that need to be modified. on: push: branches: [main, master] pull_request: branches: [main, master] name: check-all-examples jobs: check-all-examples: uses: easystats/workflows/.github/workflows/check-all-examples.yaml@main ================================================ FILE: .github/workflows/check-link-rot.yaml ================================================ on: push: branches: [main, master] pull_request: branches: [main, master] name: check-link-rot jobs: check-link-rot: uses: easystats/workflows/.github/workflows/check-link-rot.yaml@main ================================================ FILE: .github/workflows/check-random-test-order.yaml ================================================ # Run tests in random order on: push: branches: [main, master] pull_request: branches: [main, master] name: check-random-test-order jobs: check-random-test-order: uses: easystats/workflows/.github/workflows/check-random-test-order.yaml@main ================================================ FILE: .github/workflows/check-readme.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] name: check-readme jobs: check-readme: uses: easystats/workflows/.github/workflows/check-readme.yaml@main ================================================ FILE: .github/workflows/check-spelling.yaml ================================================ on: push: branches: [main, master] pull_request: branches: [main, master] name: check-spelling jobs: check-spelling: uses: easystats/workflows/.github/workflows/check-spelling.yaml@main ================================================ FILE: .github/workflows/check-styling.yaml ================================================ on: push: branches: [main, master] pull_request: branches: [main, master] name: check-styling jobs: check-styling: uses: easystats/workflows/.github/workflows/check-styling.yaml@main ================================================ FILE: .github/workflows/check-test-warnings.yaml ================================================ # Running tests with options(warn = 2) to fail on test warnings on: push: branches: [main, master] pull_request: branches: [main, master] name: check-test-warnings jobs: check-test-warnings: uses: easystats/workflows/.github/workflows/check-test-warnings.yaml@main ================================================ FILE: .github/workflows/check-vignette-warnings.yaml ================================================ # Running tests with options(warn = 2) to fail on test warnings on: push: branches: [main, master] pull_request: branches: [main, master] name: check-vignette-warnings jobs: check-vignette-warnings: uses: easystats/workflows/.github/workflows/check-vignette-warnings.yaml@main ================================================ FILE: .github/workflows/html-5-check.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] name: html-5-check jobs: html-5-check: uses: easystats/workflows/.github/workflows/html-5-check.yaml@main ================================================ FILE: .github/workflows/lint-changed-files.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: pull_request: branches: [main, master] name: lint-changed-files jobs: lint-changed-files: uses: easystats/workflows/.github/workflows/lint-changed-files.yaml@main ================================================ FILE: .github/workflows/lint.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] name: lint jobs: lint: uses: easystats/workflows/.github/workflows/lint.yaml@main ================================================ FILE: .github/workflows/pkgdown-no-suggests.yaml ================================================ on: push: branches: [main, master] pull_request: branches: [main, master] name: pkgdown-no-suggests jobs: pkgdown-no-suggests: uses: easystats/workflows/.github/workflows/pkgdown-no-suggests.yaml@main ================================================ FILE: .github/workflows/pkgdown.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] release: types: [published] workflow_dispatch: name: pkgdown jobs: pkgdown: uses: easystats/workflows/.github/workflows/pkgdown.yaml@main ================================================ FILE: .github/workflows/test-coverage-examples.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] name: test-coverage-examples jobs: test-coverage-examples: uses: easystats/workflows/.github/workflows/test-coverage-examples.yaml@main ================================================ FILE: .github/workflows/test-coverage.yaml ================================================ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] name: test-coverage jobs: test-coverage: uses: easystats/workflows/.github/workflows/test-coverage.yaml@main ================================================ FILE: .github/workflows/update-to-latest-easystats.yaml ================================================ on: schedule: # Check for dependency updates once a month - cron: "0 0 1 * *" name: update-to-latest-easystats jobs: update-to-latest-easystats: uses: easystats/workflows/.github/workflows/update-to-latest-easystats.yaml@main ================================================ FILE: .gitignore ================================================ # History files .Rhistory .Rapp.history # Session Data files .RData # Example code in package build process *-Ex.R # Output files from R CMD build /*.tar.gz # Output files from R CMD check /*.Rcheck/ # RStudio files .Rproj.user/ # produced vignettes vignettes/*.html vignettes/*.pdf # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 .httr-oauth # knitr and R markdown default cache directories /*_cache/ /cache/ # Temporary files created by R markdown *.utf8.md *.knit.md # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html rsconnect/ ========================= # Operating System Files # OSX .DS_Store .AppleDouble .LSOverride # Thumbnails ._* # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 .TemporaryItems .Trashes .VolumeIcon.icns # Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder Temporary Items .apdisk docs inst/doc CRAN-SUBMISSION ================================================ FILE: .lintr ================================================ linters: all_linters( absolute_path_linter = NULL, assignment_linter = NULL, commented_code_linter = NULL, cyclocomp_linter(25L), if_not_else_linter(exceptions = character(0L)), implicit_integer_linter = NULL, library_call_linter = NULL, line_length_linter(120L), namespace_linter = NULL, nonportable_path_linter = NULL, object_name_linter = NULL, object_length_linter(50L), object_usage_linter = NULL, todo_comment_linter = NULL, string_boundary_linter = NULL, strings_as_factors_linter = NULL, undesirable_function_linter = NULL, undesirable_operator_linter = NULL, unnecessary_concatenation_linter(allow_single_expression = FALSE), unused_import_linter = NULL ) ================================================ FILE: DESCRIPTION ================================================ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations Version: 1.3.1 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), person("Etienne", "Bacher", , "etienne.bacher@protonmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9271-5075")), person("Dominique", "Makowski", , "dom.makowski@gmail.com", role = "aut", comment = c(ORCID = "0000-0001-5375-9967")), person("Daniel", "Lüdecke", , "d.luedecke@uke.de", role = "aut", comment = c(ORCID = "0000-0002-8895-3206")), person("Mattan S.", "Ben-Shachar", , "matanshm@post.bgu.ac.il", role = "aut", comment = c(ORCID = "0000-0002-4287-4801")), person("Brenton M.", "Wiernik", , "brenton@wiernik.org", role = "aut", comment = c(ORCID = "0000-0001-9560-6336")), person("Rémi", "Thériault", , "remi.theriault@mail.mcgill.ca", role = "ctb", comment = c(ORCID = "0000-0003-4315-6788")), person("Thomas J.", "Faulkenberry", , "faulkenberry@tarleton.edu", role = "rev"), person("Robert", "Garrett", , "rcg4@illinois.edu", role = "rev") ) Maintainer: Etienne Bacher Description: A lightweight package to assist in key steps involved in any data analysis workflow: (1) wrangling the raw data to get it in the needed form, (2) applying preprocessing steps and statistical transformations, and (3) compute statistical summaries of data properties and distributions. It is also the data wrangling backend for packages in 'easystats' ecosystem. References: Patil et al. (2022) . License: MIT + file LICENSE URL: https://easystats.github.io/datawizard/ BugReports: https://github.com/easystats/datawizard/issues Depends: R (>= 4.0) Imports: insight (>= 1.4.6), stats, utils Suggests: bayestestR, boot, BH, brms, curl, data.table, dplyr (>= 1.1), effectsize, emmeans, fixest, gamm4, ggplot2 (>= 3.5.0), gt, haven, httr, knitr, lme4, mediation, modelbased, nanoparquet, openssl, parameters (>= 0.21.7), performance (>= 0.14.0), poorman (>= 0.2.7), psych, readxl, readr, rio, rmarkdown, rstanarm, see, testthat (>= 3.2.1), tibble, tidyr, tinytable (>= 0.13.0), withr VignetteBuilder: knitr Encoding: UTF-8 Language: en-US Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.3 Config/testthat/edition: 3 Config/testthat/parallel: true Config/Needs/website: easystats/easystatstemplate ================================================ FILE: LICENSE ================================================ YEAR: 2023 COPYRIGHT HOLDER: datawizard authors ================================================ FILE: LICENSE.md ================================================ # MIT License Copyright (c) 2023 datawizard authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: NAMESPACE ================================================ # Generated by roxygen2: do not edit by hand S3method(as.data.frame,datawizard_crosstabs) S3method(as.data.frame,datawizard_tables) S3method(as.double,parameters_kurtosis) S3method(as.double,parameters_skewness) S3method(as.double,parameters_smoothness) S3method(as.numeric,parameters_kurtosis) S3method(as.numeric,parameters_skewness) S3method(as.numeric,parameters_smoothness) S3method(as.prop.table,datawizard_crosstab) S3method(as.prop.table,datawizard_crosstabs) S3method(as.table,datawizard_crosstab) S3method(as.table,datawizard_crosstabs) S3method(as.table,datawizard_table) S3method(as.table,datawizard_tables) S3method(assign_labels,character) S3method(assign_labels,data.frame) S3method(assign_labels,default) S3method(assign_labels,factor) S3method(assign_labels,numeric) S3method(categorize,data.frame) S3method(categorize,default) S3method(categorize,factor) S3method(categorize,grouped_df) S3method(categorize,numeric) S3method(center,AsIs) S3method(center,Date) S3method(center,character) S3method(center,data.frame) S3method(center,default) S3method(center,factor) S3method(center,grouped_df) S3method(center,logical) S3method(center,numeric) S3method(coef_var,default) S3method(coef_var,numeric) S3method(convert_na_to,character) S3method(convert_na_to,data.frame) S3method(convert_na_to,default) S3method(convert_na_to,factor) S3method(convert_na_to,numeric) S3method(convert_to_na,Date) S3method(convert_to_na,character) S3method(convert_to_na,data.frame) S3method(convert_to_na,default) S3method(convert_to_na,factor) S3method(convert_to_na,logical) S3method(convert_to_na,numeric) S3method(data_arrange,default) S3method(data_arrange,grouped_df) S3method(data_duplicated,data.frame) S3method(data_duplicated,grouped_df) S3method(data_extract,data.frame) S3method(data_filter,data.frame) S3method(data_filter,grouped_df) S3method(data_merge,data.frame) S3method(data_merge,list) S3method(data_modify,data.frame) S3method(data_modify,default) S3method(data_modify,grouped_df) S3method(data_peek,data.frame) S3method(data_summary,data.frame) S3method(data_summary,default) S3method(data_summary,grouped_df) S3method(data_summary,matrix) S3method(data_tabulate,data.frame) S3method(data_tabulate,default) S3method(data_tabulate,grouped_df) S3method(data_unique,data.frame) S3method(data_unique,grouped_df) S3method(describe_distribution,character) S3method(describe_distribution,data.frame) S3method(describe_distribution,default) S3method(describe_distribution,factor) S3method(describe_distribution,grouped_df) S3method(describe_distribution,list) S3method(describe_distribution,numeric) S3method(display,data_codebook) S3method(display,datawizard_crosstab) S3method(display,datawizard_crosstabs) S3method(display,datawizard_table) S3method(display,datawizard_tables) S3method(display,parameters_distribution) S3method(format,data_codebook) S3method(format,datawizard_crosstab) S3method(format,datawizard_table) S3method(format,dw_data_peek) S3method(format,dw_groupmeans) S3method(format,parameters_distribution) S3method(kurtosis,data.frame) S3method(kurtosis,default) S3method(kurtosis,matrix) S3method(kurtosis,numeric) S3method(labels_to_levels,data.frame) S3method(labels_to_levels,default) S3method(labels_to_levels,factor) S3method(makepredictcall,dw_transformer) S3method(means_by_group,data.frame) S3method(means_by_group,default) S3method(means_by_group,numeric) S3method(normalize,data.frame) S3method(normalize,factor) S3method(normalize,grouped_df) S3method(normalize,matrix) S3method(normalize,numeric) S3method(plot,parameters_distribution) S3method(plot,visualisation_recipe) S3method(print,data_codebook) S3method(print,data_seek) S3method(print,datawizard_crosstab) S3method(print,datawizard_crosstabs) S3method(print,datawizard_table) S3method(print,datawizard_tables) S3method(print,dw_data_peek) S3method(print,dw_data_summary) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) S3method(print,parameters_distribution) S3method(print,parameters_kurtosis) S3method(print,parameters_skewness) S3method(print,visualisation_recipe) S3method(print_html,data_codebook) S3method(print_html,datawizard_crosstab) S3method(print_html,datawizard_crosstabs) S3method(print_html,datawizard_table) S3method(print_html,datawizard_tables) S3method(print_html,dw_data_peek) S3method(print_html,parameters_distribution) S3method(print_md,data_codebook) S3method(print_md,datawizard_crosstab) S3method(print_md,datawizard_crosstabs) S3method(print_md,datawizard_table) S3method(print_md,datawizard_tables) S3method(print_md,dw_data_peek) S3method(print_md,parameters_distribution) S3method(ranktransform,data.frame) S3method(ranktransform,factor) S3method(ranktransform,grouped_df) S3method(ranktransform,numeric) S3method(recode_values,character) S3method(recode_values,data.frame) S3method(recode_values,default) S3method(recode_values,factor) S3method(recode_values,numeric) S3method(replace_nan_inf,data.frame) S3method(replace_nan_inf,default) S3method(rescale,data.frame) S3method(rescale,default) S3method(rescale,grouped_df) S3method(rescale,numeric) S3method(reverse,data.frame) S3method(reverse,default) S3method(reverse,factor) S3method(reverse,grouped_df) S3method(reverse,numeric) S3method(rowid_as_column,default) S3method(rowid_as_column,grouped_df) S3method(skewness,data.frame) S3method(skewness,default) S3method(skewness,matrix) S3method(skewness,numeric) S3method(slide,data.frame) S3method(slide,default) S3method(slide,numeric) S3method(smoothness,data.frame) S3method(smoothness,default) S3method(smoothness,numeric) S3method(standardize,AsIs) S3method(standardize,Date) S3method(standardize,Surv) S3method(standardize,bcplm) S3method(standardize,biglm) S3method(standardize,brmsfit) S3method(standardize,character) S3method(standardize,clm2) S3method(standardize,data.frame) S3method(standardize,datagrid) S3method(standardize,default) S3method(standardize,double) S3method(standardize,factor) S3method(standardize,fixest) S3method(standardize,grouped_df) S3method(standardize,integer) S3method(standardize,logical) S3method(standardize,matrix) S3method(standardize,mediate) S3method(standardize,mixor) S3method(standardize,numeric) S3method(standardize,visualisation_matrix) S3method(standardize,wbgee) S3method(standardize,wbm) S3method(summary,parameters_kurtosis) S3method(summary,parameters_skewness) S3method(to_factor,Date) S3method(to_factor,character) S3method(to_factor,data.frame) S3method(to_factor,default) S3method(to_factor,double) S3method(to_factor,factor) S3method(to_factor,haven_labelled) S3method(to_factor,logical) S3method(to_factor,numeric) S3method(to_numeric,Date) S3method(to_numeric,POSIXct) S3method(to_numeric,POSIXlt) S3method(to_numeric,POSIXt) S3method(to_numeric,character) S3method(to_numeric,data.frame) S3method(to_numeric,default) S3method(to_numeric,double) S3method(to_numeric,factor) S3method(to_numeric,haven_labelled) S3method(to_numeric,logical) S3method(to_numeric,numeric) S3method(unnormalize,data.frame) S3method(unnormalize,default) S3method(unnormalize,grouped_df) S3method(unnormalize,numeric) S3method(unstandardize,array) S3method(unstandardize,character) S3method(unstandardize,data.frame) S3method(unstandardize,datagrid) S3method(unstandardize,factor) S3method(unstandardize,grouped_df) S3method(unstandardize,matrix) S3method(unstandardize,numeric) S3method(unstandardize,visualisation_matrix) S3method(winsorize,character) S3method(winsorize,data.frame) S3method(winsorize,factor) S3method(winsorize,logical) S3method(winsorize,numeric) export(adjust) export(as.prop.table) export(assign_labels) export(categorize) export(center) export(centre) export(change_scale) export(coef_var) export(coerce_to_numeric) export(colnames_to_row) export(column_as_rownames) export(contr.deviation) export(convert_na_to) export(convert_to_na) export(data_addprefix) export(data_addsuffix) export(data_adjust) export(data_arrange) export(data_codebook) export(data_duplicated) export(data_extract) export(data_filter) export(data_group) export(data_join) export(data_match) export(data_merge) export(data_modify) export(data_partition) export(data_peek) export(data_read) export(data_relocate) export(data_remove) export(data_rename) export(data_rename_rows) export(data_reorder) export(data_replicate) export(data_restoretype) export(data_rotate) export(data_seek) export(data_select) export(data_separate) export(data_summary) export(data_tabulate) export(data_to_long) export(data_to_wide) export(data_transpose) export(data_ungroup) export(data_unique) export(data_unite) export(data_write) export(degroup) export(demean) export(describe_distribution) export(detrend) export(display) export(distribution_coef_var) export(distribution_mode) export(empty_columns) export(empty_rows) export(extract_column_names) export(find_columns) export(kurtosis) export(labels_to_levels) export(mean_sd) export(means_by_group) export(median_mad) export(normalize) export(print_html) export(print_md) export(ranktransform) export(recode_into) export(recode_values) export(remove_empty) export(remove_empty_columns) export(remove_empty_rows) export(replace_nan_inf) export(rescale) export(rescale_weights) export(reshape_ci) export(reshape_longer) export(reshape_wider) export(reverse) export(reverse_scale) export(row_count) export(row_means) export(row_sums) export(row_to_colnames) export(rowid_as_column) export(rownames_as_column) export(skewness) export(slide) export(smoothness) export(standardise) export(standardize) export(text_concatenate) export(text_format) export(text_fullstop) export(text_lastchar) export(text_paste) export(text_remove) export(text_wrap) export(to_factor) export(to_numeric) export(unnormalize) export(unstandardise) export(unstandardize) export(visualisation_recipe) export(weighted_mad) export(weighted_mean) export(weighted_median) export(weighted_sd) export(winsorize) importFrom(insight,display) importFrom(insight,print_html) importFrom(insight,print_md) importFrom(stats,makepredictcall) ================================================ FILE: NEWS.md ================================================ # datawizard 1.3.1 CHANGES * `data_summary()` now allows expressions to return more than one summary value. For each value, a new column is created. Additionally, the optional `suffix` argument controls the naming of these columns; if `suffix = NULL`, column names are auto-generated (e.g., with numeric suffixes). * `standardize()` now works on `fixest` estimations (#665). * `data_read()` and `data_write()` gain a `password` argument, to encrypt and decrypt data files. This currently only works for R file formats (`.rda`, `.rds`, and `.rdata`). Data encryption is based on the AES-GCM algorithm using the `openssl::aes_gcm_encrypt()` function (#675). FIXES * Fix a test due to R-devel change (#677). # datawizard 1.3.0 BREAKING CHANGES * Argument `values_fill` in `data_to_wide()` is now defunct, because it did not work as intended (#645). * `data_to_wide()` no longer removes empty columns that were created after widening data frames, to behave similarly to `tidyr::pivot_wider()` (#645). CHANGES * `data_tabulate()` now saves the table of proportions for crosstables as attribute, accessible via the new `as.prop.table()` method (#656). * Due to changes in the package `insight`, `data_tabulate()` no longer prints decimals when all values in a column are integers (#641). * Argument `values_from` in `data_to_wide()` now supports select-helpers like the `select` argument in other `{datawizard}` functions (#645). * Added a `display()` method for `data_codebook()` (#646). * `display()` methods now support the `{tinytable}` package. Use `format = "tt"` to export tables as `tinytable` objects (#646). * Improved performance for several functions that process grouped data frames when the input is a grouped `tibble` (#651). BUG FIXES * Fixed an issue when `demean()`ing nested structures with more than 2 grouping variables (#635). * Fixed an issue when `demean()`ing crossed structures with more than 2 grouping variables (#638). * Fixed issue in `data_to_wide()` with multiple variables assigned in `values_from` when IDs were not balanced (equally spread across observations) (#644). * Fixed issue in `data_replicate()` when data frame had only one column to replicate (#654). # datawizard 1.2.0 BREAKING CHANGES * The following deprecated arguments have been removed (#603): - `drop_na` in `data_match()` - `safe`, `pattern`, and `verbose` in `data_rename()` CHANGES * `data_read()` and `data_write()` now support the `.parquet` file format, via the *nanoparquet* package (#625). * `data_tabulate()` gets a `display()` method (#627). * `data_tabulate()` gets an `as.table()` method to coerce the frequency or contingency table into a (list of) `table()` object(s). This can be useful for further statistical analysis, e.g. in combination with `chisq.test()` (#629). * The `print()` method for `data_tabulate()` now appears in the documentation, making the `big_mark` argument visible (#627). BUG FIXES * Fixed an issue when printing cross tables using `data_tabulate(by = ...)`, which was caused by the recent changes in `insight::export_table()`. * Fixed another issue when printing cross tables using `data_tabulate(by = ...)`, when more than one variable was selected for `select` (#630). * Fixed typo in the documentation of `data_match()`. # datawizard 1.1.0 BREAKING CHANGES * `data_read()` now also returns Bayesian models from packages *brms* and *rstanarm* as original model objects, and no longer coerces them into data frames (#606). * The output format of `describe_distribution()` on grouped data has changed. Before, it printed one table per group combination. Now, it prints a single table with group columns at the start (#610). * The output format of `describe_distribution()` when confidence intervals are requested has changed. Now, for each centrality measure a confidence interval is calculated (#617). * `data_modify()` now always uses values of a vector for a modified or newly created variable, and no longer tries to detect whether a character value possibly contains an expression. To allow expression provided as string (or character vectors), use the helper-function `as_expr()`. Only literal expressions or strings wrapped in `as_expr()` will be evaluated as expressions, everything else will be treated as vector with values for new variables (#605). CHANGES * `display()` is now re-exported from package *insight*. * `data_read()` and `data_write()` now rely on base-R functions for files of type `.rds`, `.rda` or `.rdata`. Thus, package *rio* is no longer required to be installed for these file types (#607). * `data_codebook()` gives an informative warning when no column names matched the selection pattern (#601). * `data_to_long()` now errors when columns selected to reshape do not exist in the data, to avoid nonsensical results that could be missed (#602). * New argument `by` in `describe_distribution()` (#604). * `describe_distribution()` now gives informative errors when column names in the input data frame conflict with column from the output table (#612). * The methods for `parameters_distribution` objects are now defined in `datawizard` (they were previously in `parameters`) (#613). BUG FIXES * Fixed bug in `data_to_wide()`, where new column names in `names_from` were ignored when that column only contained one unique value. * Fixed bug in `describe_distribution()` when some group combinations didn't appear in the data (#609). * Fixed bug in `describe_distribution()` when more than one value for the `centrality` argument were specified (#617). * Fixed bug in `describe_distribution()` where setting `verbose = FALSE` didn't hide some warnings (#617). * Fixed warning in `data_summary()` when a variable had the same name as another object in the global environment (#585). # datawizard 1.0.2 BUG FIXES * Fixed failing R CMD check on ATLAS, noLD, and OpenBLAS due to small numerical differences (#592). # datawizard 1.0.1 BUG FIXES * Fixed issue in `data_arrange()` for data frames that only had one column. Formerly, the data frame was coerced into a vector, now the data frame class is preserved. * Fixed issue in R-devel (4.5.0) due to a change in how `grep()` handles logical arguments with missing values (#588). # datawizard 1.0.0 BREAKING CHANGES AND DEPRECATIONS * *datawizard* now requires R >= 4.0 (#515). * Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na` instead (#556). * In `data_rename()` (#567): - argument `pattern` is deprecated. Use `select` instead. - argument `safe` is deprecated. The function now errors when `select` contains unknown column names. - when `replacement` is `NULL`, an error is now thrown (previously, column indices were used as new names). - if `select` (previously `pattern`) is a named vector, then all elements must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors. * Order of arguments `by` and `probability_weights` in `rescale_weights()` has changed, because for `method = "kish"`, the `by` argument is optional (#575). * The name of the rescaled weights variables in `rescale_weights()` have been renamed. `pweights_a` and `pweights_b` are now named `rescaled_weights_a` and `rescaled_weights_b` (#575). * `print()` methods for `data_tabulate()` with multiple sub-tables (i.e. when length of `by` was > 1) were revised. Now, an integrated table instead of multiple tables is returned. Furthermore, `print_html()` did not work, which was also fixed now (#577). * `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`, to append the centered variables to the original data frame, instead of returning the de- and group-meaned variables only. Use `append = FALSE` to for the previous default behaviour (i.e. only returning the newly created variables) (#579). CHANGES * `rescale_weights()` gets a `method` argument, to choose method to rescale weights. Options are `"carle"` (the default) and `"kish"` (#575). * The `select` argument, which is available in different functions to select variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`) (#551). * New function `row_sums()`, to calculate row sums (optionally with minimum amount of valid values), as complement to `row_means()` (#552). * New function `row_count()`, to count specific values row-wise (#553). * `data_read()` no longer shows warning about forthcoming breaking changes in upstream packages when reading `.RData` files (#557). * `data_modify()` now recognizes `n()`, for example to create an index for data groups with `1:n()` (#535). * The `replacement` argument in `data_rename()` now supports glue-styled tokens (#563). * `data_summary()` also accepts the results of `bayestestR::ci()` as summary function (#483). * `ranktransform()` has a new argument `zeros` to determine how zeros should be handled when `sign = TRUE` (#573). BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute CIs. Instead, it warns the user and returns `NA` (#550). * `data_read()` preserves variable types when importing files from `rds` or `rdata` format (#558). # datawizard 0.13.0 BREAKING CHANGES * `data_rename()` now errors when the `replacement` argument contains `NA` values or empty strings (#539). * Removed deprecated functions `get_columns()`, `data_find()`, `format_text()` (#546). * Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546). * The default value for the argument `dummy_factors` in `to_numeric()` has changed from `TRUE` to `FALSE` (#544). CHANGES * The `pattern` argument in `data_rename()` can also be a named vector. In this case, names are used as values for the `replacement` argument (i.e. `pattern` can be a character vector using ` = ""`). * `categorize()` gains a new `breaks` argument, to decide whether breaks are inclusive or exclusive (#548). * The `labels` argument in `categorize()` gets two new options, `"range"` and `"observed"`, to use the range of categorized values as labels (i.e. factor levels) (#548). * Minor additions to `reshape_ci()` to work with forthcoming changes in the `{bayestestR}` package. # datawizard 0.12.3 CHANGES * `demean()` (and `degroup()`) now also work for nested designs, if argument `nested = TRUE` and `by` specifies more than one variable (#533). * Vignettes are no longer provided in the package, they are now only available on the website. There is only one "Overview" vignette available in the package, it contains links to the other vignettes on the website. This is because there are CRAN errors occurring when building vignettes on macOS and we couldn't determine the cause after multiple patch releases (#534). # datawizard 0.12.2 * Remove `htmltools` from `Suggests` in an attempt of fixing an error in CRAN checks due to failures to build a vignette (#528). # datawizard 0.12.1 This is a patch release to fix one error on CRAN checks occurring because of a missing package namespace in one of the vignettes. # datawizard 0.12.0 BREAKING CHANGES * The argument `include_na` in `data_tabulate()` and `data_summary()` has been renamed into `remove_na`. Consequently, to mimic former behaviour, `FALSE` and `TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former `include_na = FALSE`). * Class names for objects returned by `data_tabulate()` have been changed to `datawizard_table` and `datawizard_crosstable` (resp. the plural forms, `*_tables`), to provide a clearer and more consistent naming scheme. CHANGES * `data_select()` can directly rename selected variables when a named vector is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`. * `data_tabulate()` gains an `as.data.frame()` method, to return the frequency table as a data frame. The structure of the returned object is a nested data frame, where the first column contains name of the variable for which frequencies were calculated, and the second column contains the frequency table. * `demean()` (and `degroup()`) now also work for cross-classified designs, or more generally, for data with multiple grouping or cluster variables (i.e. `by` can now specify more than one variable). # datawizard 0.11.0 BREAKING CHANGES * Arguments named `group` or `group_by` are deprecated and will be removed in a future release. Please use `by` instead. This affects the following functions in *datawizard* (#502). * `data_partition()` * `demean()` and `degroup()` * `means_by_group()` * `rescale_weights()` * Following aliases are deprecated and will be removed in a future release (#504): * `get_columns()`, use `data_select()` instead. * `data_find()` and `find_columns()`, use `extract_column_names()` instead. * `format_text()`, use `text_format()` instead. CHANGES * `recode_into()` is more relaxed regarding checking the type of `NA` values. If you recode into a numeric variable, and one of the recode values is `NA`, you no longer need to use `NA_real_` for numeric `NA` values. * Improved documentation for some functions. BUG FIXES * `data_to_long()` did not work for data frame where columns had attributes (like labelled data). # datawizard 0.10.0 BREAKING CHANGES * The following arguments were deprecated in 0.5.0 and are now removed: * in `data_to_wide()`: `colnames_from`, `rows_from`, `sep` * in `data_to_long()`: `colnames_to` * in `data_partition()`: `training_proportion` NEW FUNCTIONS * `data_summary()`, to compute summary statistics of (grouped) data frames. * `data_replicate()`, to expand a data frame by replicating rows based on another variable that contains the counts of replications per row. CHANGES * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify variables at specific positions or based on logical conditions. * `data_tabulate()` was revised and gets several new arguments: a `weights` argument, to compute weighted frequency tables. `include_na` allows to include or omit missing values from the table. Furthermore, a `by` argument was added, to compute crosstables (#479, #481). # datawizard 0.9.1 CHANGES * `rescale()` gains `multiply` and `add` arguments, to expand ranges by a given factor or value. * `to_factor()` and `to_numeric()` now support class `haven_labelled`. BUG FIXES * `to_numeric()` now correctly deals with inversed factor levels when `preserve_levels = TRUE`. * `to_numeric()` inversed order of value labels when `dummy_factors = FALSE`. * `convert_to_na()` now preserves attributes for factors when `drop_levels = TRUE`. # datawizard 0.9.0 NEW FUNCTIONS * `row_means()`, to compute row means, optionally only for the rows with at least `min_valid` non-missing values. * `contr.deviation()` for sum-deviation contrast coding of factors. * `means_by_group()`, to compute mean values of variables, grouped by levels of specified factors. * `data_seek()`, to seek for variables in a data frame, based on their column names, variables labels, value labels or factor levels. Searching for labels only works for "labelled" data, i.e. when variables have a `label` or `labels` attribute. CHANGES * `recode_into()` gains an `overwrite` argument to skip overwriting already recoded cases when multiple recode patterns apply to the same case. * `recode_into()` gains an `preserve_na` argument to preserve `NA` values when recoding. * `data_read()` now passes the `encoding` argument to `data.table::fread()`. This allows to read files with non-ASCII characters. * `datawizard` moves from the GPL-3 license to the MIT license. * `unnormalize()` and `unstandardize()` now work with grouped data (#415). * `unnormalize()` now errors instead of emitting a warning if it doesn't have the necessary info (#415). BUG FIXES * Fixed issue in `labels_to_levels()` when values of labels were not in sorted order and values were not sequentially numbered. * Fixed issues in `data_write()` when writing labelled data into SPSS format and vectors were of different type as value labels. * Fixed issues in `data_write()` when writing labelled data into SPSS format for character vectors with missing value labels, but existing variable labels. * Fixed issue in `recode_into()` with probably wrong case number printed in the warning when several recode patterns match to one case. * Fixed issue in `recode_into()` when original data contained `NA` values and `NA` was not included in the recode pattern. * Fixed issue in `data_filter()` where functions containing a `=` (e.g. when naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as faulty syntax. * Fixed issue in `empty_column()` for strings with invalid multibyte strings. For such data frames or files, `empty_column()` or `data_read()` no longer fails. # datawizard 0.8.0 BREAKING CHANGES * The following re-exported functions from `{insight}` have now been removed: `object_has_names()`, `object_has_rownames()`, `is_empty_object()`, `compact_list()`, `compact_character()`. * Argument `na.rm` was renamed to `remove_na` throughout `{datawizard}` functions. `na.rm` is kept for backward compatibility, but will be deprecated and later removed in future updates. * The way expressions are defined in `data_filter()` was revised. The `filter` argument was replaced by `...`, allowing to separate multiple expression with a comma (which are then combined with `&`). Furthermore, expressions can now also be defined as strings, or be provided as character vectors, to allow string-friendly programming. CHANGES * Weighted-functions (`weighted_sd()`, `weighted_mean()`, ...) gain a `remove_na` argument, to remove or keep missing and infinite values. By default, `remove_na = TRUE`, i.e. missing and infinite values are removed by default. * `reverse_scale()`, `normalize()` and `rescale()` gain an `append` argument (similar to other data frame methods of transformation functions), to append recoded variables to the input data frame instead of overwriting existing variables. NEW FUNCTIONS * `rowid_as_column()` to complement `rownames_as_column()` (and to mimic `tibble::rowid_to_column()`). Note that its behavior is different from `tibble::rowid_to_column()` for grouped data. See the Details section in the docs. * `data_unite()`, to merge values of multiple variables into one new variable. * `data_separate()`, as counterpart to `data_unite()`, to separate a single variable into multiple new variables. * `data_modify()`, to create new variables, or modify or remove existing variables in a data frame. MINOR CHANGES * `to_numeric()` for variables of type `Date`, `POSIXct` and `POSIXlt` now includes the class name in the warning message. * Added a `print()` method for `center()`, `standardize()`, `normalize()` and `rescale()`. BUG FIXES * `standardize_parameters()` now works when the package namespace is in the model formula (#401). * `data_merge()` no longer yields a warning for `tibbles` when `join = "bind"`. * `center()` and `standardize()` did not work for grouped data frames (of class `grouped_df`) when `force = TRUE`. * The `data.frame` method of `describe_distribution()` returns `NULL` instead of an error if no valid variable were passed (for example a factor variable with `include_factors = FALSE`) (#421). # datawizard 0.7.1 BREAKING CHANGES * `add_labs()` was renamed into `assign_labels()`. Since `add_labs()` existed only for a few days, there will be no alias for backwards compatibility. NEW FUNCTIONS * `labels_to_levels()`, to use value labels of factors as their levels. MINOR CHANGES * `data_read()` now checks if the imported object actually is a data frame (or coercible to a data frame), and if not, no longer errors, but gives an informative warning of the type of object that was imported. BUG FIXES * Fix test for CRAN check on Mac OS arm64 # datawizard 0.7.0 BREAKING CHANGES * In selection patterns, expressions like `-var1:var3` to exclude all variables between `var1` and `var3` are no longer accepted. The correct expression is `-(var1:var3)`. This is for 2 reasons: * to be consistent with the behavior for numerics (`-1:2` is not accepted but `-(1:2)` is); * to be consistent with `dplyr::select()`, which throws a warning and only uses the first variable in the first expression. NEW FUNCTIONS * `recode_into()`, similar to `dplyr::case_when()`, to recode values from one or more variables into a new variable. * `mean_sd()` and `median_mad()` for summarizing vectors to their mean (or median) and a range of one SD (or MAD) above and below. * `data_write()` as counterpart to `data_read()`, to write data frames into CSV, SPSS, SAS, Stata files and many other file types. One advantage over existing functions to write data in other packages is that labelled (numeric) data can be converted into factors (with values labels used as factor levels) even for text formats like CSV and similar. This allows exporting "labelled" data into those file formats, too. * `add_labs()`, to manually add value and variable labels as attributes to variables. These attributes are stored as `"label"` and `"labels"` attributes, similar to the `labelled` class from the _haven_ package. MINOR CHANGES * `data_rename()` gets a `verbose` argument. * `winsorize()` now errors if the threshold is incorrect (previously, it provided a warning and returned the unchanged data). The argument `verbose` is now useless but is kept for backward compatibility. The documentation now contains details about the valid values for `threshold` (#357). * In all functions that have arguments `select` and/or `exclude`, there is now one warning per misspelled variable. The previous behavior was to have only one warning. * Fixed inconsistent behaviour in `standardize()` when only one of the arguments `center` or `scale` were provided (#365). * `unstandardize()` and `replace_nan_inf()` now work with select helpers (#376). * Added informative warning and error messages to `reverse()`. Furthermore, the docs now describe the `range` argument more clearly (#380). * `unnormalize()` errors with unexpected inputs (#383). BUG FIXES * `empty_columns()` (and therefore `remove_empty_columns()`) now correctly detects columns containing only `NA_character_` (#349). * Select helpers now work in custom functions when argument is called `select` (#356). * Fix unexpected warning in `convert_na_to()` when `select` is a list (#352). * Fixed issue with correct labelling of numeric variables with more than nine unique values and associated value labels. # datawizard 0.6.5 MAJOR CHANGES * Etienne Bacher is the new maintainer. MINOR CHANGES * `standardize()`, `center()`, `normalize()` and `rescale()` can be used in model formulas, similar to `base::scale()`. * `data_codebook()` now includes the proportion for each category/value, in addition to the counts. Furthermore, if data contains tagged `NA` values, these are included in the frequency table. BUG FIXES * `center(x)` now works correctly when `x` is a single value and either `reference` or `center` is specified (#324). * Fixed issue in `data_codebook()`, which failed for labelled vectors when values of labels were not in sorted order. # datawizard 0.6.4 NEW FUNCTIONS * `data_codebook()`: to generate codebooks of data frames. * New functions to deal with duplicates: `data_duplicated()` (keep all duplicates, including the first occurrence) and `data_unique()` (returns the data, excluding all duplicates except one instance of each, based on the selected method). MINOR CHANGES * `.data.frame` methods should now preserve custom attributes. * The `include_bounds` argument in `normalize()` can now also be a numeric value, defining the limit to the upper and lower bound (i.e. the distance to 1 and 0). * `data_filter()` now works with grouped data. BUG FIXES * `data_read()` no longer prints message for empty columns when the data actually had no empty columns. * `data_to_wide()` now drops columns that are not in `id_cols` (if specified), `names_from`, or `values_from`. This is the behaviour observed in `tidyr::pivot_wider()`. # datawizard 0.6.3 MAJOR CHANGES * There is a new publication about the `{datawizard}` package: * Fixes failing tests due to changes in `R-devel`. * `data_to_long()` and `data_to_wide()` have had significant performance improvements, sometimes as high as a ten-fold speedup. MINOR CHANGES * When column names are misspelled, most functions now suggest which existing columns possibly could be meant. * Miscellaneous performance gains. * `convert_to_na()` now requires argument `na` to be of class 'Date' to convert specific dates to `NA`. For example, `convert_to_na(x, na = "2022-10-17")` must be changed to `convert_to_na(x, na = as.Date("2022-10-17"))`. BUG FIXES * `data_to_long()` and `data_to_wide()` now correctly keep the `date` format. # datawizard 0.6.2 BREAKING CHANGES * Methods for grouped data frames (`.grouped_df`) no longer support `dplyr::group_by()` for `{dplyr}` before version `0.8.0`. * `empty_columns()` and `remove_empty_columns()` now also remove columns that contain only empty characters. Likewise, `empty_rows()` and `remove_empty_rows()` remove observations that completely have missing or empty character values. MINOR CHANGES * `data_read()` gains a `convert_factors` argument, to turn off automatic conversion from numeric variables into factors. BUG FIXES * `data_arrange()` now works with data frames that were grouped using `data_group()` (#274). # datawizard 0.6.1 * Updates tests for upcoming changes in the `{tidyselect}` package (#267). # datawizard 0.6.0 BREAKING CHANGES * The minimum needed R version has been bumped to `3.6`. * Following deprecated functions have been removed: `data_cut()`, `data_recode()`, `data_shift()`, `data_reverse()`, `data_rescale()`, `data_to_factor()`, `data_to_numeric()` * New `text_format()` alias is introduced for `format_text()`, latter of which will be removed in the next release. * New `recode_values()` alias is introduced for `change_code()`, latter of which will be removed in the next release. * `data_merge()` now errors if columns specified in `by` are not in both datasets. * Using negative values in arguments `select` and `exclude` now removes the columns from the selection/exclusion. The previous behavior was to start the selection/exclusion from the end of the dataset, which was inconsistent with the use of "-" with other selecting possibilities. NEW FUNCTIONS * `data_peek()`: to peek at values and type of variables in a data frame. * `coef_var()`: to compute the coefficient of variation. CHANGES * `data_filter()` will give more informative messages on malformed syntax of the `filter` argument. * It is now possible to use curly brackets to pass variable names to `data_filter()`, like the following example. See examples section in the documentation of `data_filter()`. * The `regex` argument was added to functions that use select-helpers and did not already have this argument. * Select helpers `starts_with()`, `ends_with()`, and `contains()` now accept several patterns, e.g `starts_with("Sep", "Petal")`. * Arguments `select` and `exclude` that are present in most functions have been improved to work in loops and in custom functions. For example, the following code now works: ```r foo <- function(data) { i <- "Sep" find_columns(data, select = starts_with(i)) } foo(iris) for (i in c("Sepal", "Sp")) { head(iris) |> find_columns(select = starts_with(i)) |> print() } ``` * There is now a vignette summarizing the various ways to select or exclude variables in most `{datawizard}` functions. # datawizard 0.5.1 * Fixes failing tests due to `{poorman}` update. # datawizard 0.5.0 MAJOR CHANGES * Following statistical transformation functions have been renamed to not have `data_*()` prefix, since they do not work exclusively with data frames, but are typically first of all used with vectors, and therefore had misleading names: - `data_cut()` -> `categorize()` - `data_recode()` -> `change_code()` - `data_shift()` -> `slide()` - `data_reverse()` -> `reverse()` - `data_rescale()` -> `rescale()` - `data_to_factor()` -> `to_factor()` - `data_to_numeric()` -> `to_numeric()` Note that these functions also have `.data.frame()` methods and still work for data frames as well. Former function names are still available as aliases, but will be deprecated and removed in a future release. * Bumps the needed minimum R version to `3.5`. * Removed deprecated function `data_findcols()`. Please use its replacement, `data_find()`. * Removed alias `extract()` for `data_extract()` function since it collided with `tidyr::extract()`. * Argument `training_proportion` in `data_partition()` is deprecated. Please use `proportion` now. * Given his continued and significant contributions to the package, Etienne Bacher (@etiennebacher) is now included as an author. * `unstandardise()` now works for `center(x)` * `unnormalize()` now works for `change_scale(x)` * `reshape_wider()` now follows more consistently `tidyr::pivot_wider()` syntax. Arguments `colnames_from`, `sep`, and `rows_from` are deprecated and should be replaced by `names_from`, `names_sep`, and `id_cols` respectively. `reshape_wider()` also gains an argument `names_glue` (#182, #198). * Similarly, `reshape_longer()` now follows more consistently `tidyr::pivot_longer()` syntax. Argument `colnames_to` is deprecated and should be replaced by `names_to`. `reshape_longer()` also gains new arguments: `names_prefix`, `names_sep`, `names_pattern`, and `values_drop_na` (#189). CHANGES * Some of the text formatting helpers (like `text_concatenate()`) gain an `enclose` argument, to wrap text elements with surrounding characters. * `winsorize` now accepts "raw" and "zscore" methods (in addition to "percentile"). Additionally, when `robust` is set to `TRUE` together with `method = "zscore"`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47). * `convert_na_to` now accepts numeric replacements on character vectors and single replacement for multiple vector classes. (@rempsyc, #214). * `data_partition()` now allows to create multiple partitions from the data, returning multiple training and a remaining test set. * Functions like `center()`, `normalize()` or `standardize()` no longer fail when data contains infinite values (`Inf`). NEW FUNCTIONS * `row_to_colnames()` and `colnames_to_row()` to move a row to column names, and column names to row (@etiennebacher, #169). * `data_arrange()` to sort the rows of a dataframe according to the values of the selected columns. BUG FIXES * Fixed wrong column names in `data_to_wide()` (#173). # datawizard 0.4.1 BREAKING * Added the `standardize.default()` method (moved from package **effectsize**), to be consistent in that the default-method now is in the same package as the generic. `standardize.default()` behaves exactly like in **effectsize** and particularly works for regression model objects. **effectsize** now re-exports `standardize()` from **datawizard**. NEW FUNCTIONS * `data_shift()` to shift the value range of numeric variables. * `data_recode()` to recode old into new values. * `data_to_factor()` as counterpart to `data_to_numeric()`. * `data_tabulate()` to create frequency tables of variables. * `data_read()` to read (import) data files (from text, or foreign statistical packages). * `unnormalize()` as counterpart to `normalize()`. This function only works for variables that have been normalized with `normalize()`. * `data_group()` and `data_ungroup()` to create grouped data frames, or to remove the grouping information from grouped data frames. CHANGES * `data_find()` was added as alias to `find_colums()`, to have consistent name patterns for the **datawizard** functions. `data_findcols()` will be removed in a future update and usage is discouraged. * The `select` argument (and thus, also the `exclude` argument) now also accepts functions testing for logical conditions, e.g. `is.numeric()` (or `is.numeric`), or any user-defined function that selects the variables for which the function returns `TRUE` (like: `foo <- function(x) mean(x) > 3`). * Arguments `select` and `exclude` now allow the negation of select-helpers, like `-ends_with("")`, `-is.numeric` or `-Sepal.Width:Petal.Length`. * Many functions now get a `.default` method, to capture unsupported classes. This now yields a message and returns the original input, and hence, the `.data.frame` methods won't stop due to an error. * The `filter` argument in `data_filter()` can also be a numeric vector, to indicate row indices of those rows that should be returned. * `convert_to_na()` gets methods for variables of class `logical` and `Date`. * `convert_to_na()` for factors (and data frames) gains a `drop_levels` argument, to drop unused levels that have been replaced by `NA`. * `data_to_numeric()` gains two more arguments, `preserve_levels` and `lowest`, to give better control of conversion of factors. BUG FIXES * When logicals were passed to `center()` or `standardize()` and `force = TRUE`, these were not properly converted to numeric variables. # datawizard 0.4.0 MAJOR CHANGES * `data_match()` now returns filtered data by default. Old behavior (returning rows indices) can be set by setting `return_indices = TRUE`. * The following functions are now re-exported from `{insight}` package: `object_has_names()`, `object_has_rownames()`, `is_empty_object()`, `compact_list()`, `compact_character()` * `data_findcols()` will become deprecated in future updates. Please use the new replacements `find_columns()` and `get_columns()`. * The vignette *Analysing Longitudinal or Panel Data* has now moved to [parameters package](https://easystats.github.io/parameters/articles/demean.html). NEW FUNCTIONS * To convert rownames to a column, and *vice versa*: `rownames_as_column()` and `column_as_rownames()` (@etiennebacher, #80). * `find_columns()` and `get_columns()` to find column names or retrieve subsets of data frames, based on various select-methods (including select-helpers). These function will supersede `data_findcols()` in the future. * `data_filter()` as complement for `data_match()`, which works with logical expressions for filtering rows of data frames. * For computing weighted centrality measures and dispersion: `weighted_mean()`, `weighted_median()`, `weighted_sd()` and `weighted_mad()`. * To replace `NA` in vectors and dataframes: `convert_na_to()` (@etiennebacher, #111). MINOR CHANGES * The `select` argument in several functions (like `data_remove()`, `reshape_longer()`, or `data_extract()`) now allows the use of select-helpers for selecting variables based on specific patterns. * `data_extract()` gains new arguments to allow type-safe return values, i.e. *always* return a vector *or* a data frame. Thus, `data_extract()` can now be used to select multiple variables or pull a single variable from data frames. * `data_match()` gains a `match` argument, to indicate with which logical operation matching results should be combined. * Improved support for *labelled data* for many functions, i.e. returned data frame will preserve value and variable label attributes, where possible and applicable. * `describe_distribution()` now works with lists (@etiennebacher, #105). * `data_rename()` doesn't use `pattern` anymore to rename the columns if `replacement` is not provided (@etiennebacher, #103). * `data_rename()` now adds a suffix to duplicated names in `replacement` (@etiennebacher, #103). BUG FIXES * `data_to_numeric()` produced wrong results for factors when `dummy_factors = TRUE` and factor contained missing values. * `data_match()` produced wrong results when data contained missing values. * Fixed CRAN check issues in `data_extract()` when more than one variable was extracted from a data frame. # datawizard 0.3.0 NEW FUNCTIONS * To find or remove empty rows and columns in a data frame: `empty_rows()`, `empty_columns()`, `remove_empty_rows()`, `remove_empty_columns()`, and `remove_empty`. * To check for names: `object_has_names()` and `object_has_rownames()`. * To rotate data frames: `data_rotate()`. * To reverse score variables: `data_reverse()`. * To merge/join multiple data frames: `data_merge()` (or its alias `data_join()`). * To cut (recode) data into groups: `data_cut()`. * To replace specific values with `NA`s: `convert_to_na()`. * To replace `Inf` and `NaN` values with `NA`s: `replace_nan_inf()`. - Arguments `cols`, `before` and `after` in `data_relocate()` can now also be numeric values, indicating the position of the destination column. # datawizard 0.2.3 - New functions: * to work with lists: `is_empty_object()` and `compact_list()` * to work with strings: `compact_character()` # datawizard 0.2.2 - New function `data_extract()` (or its alias `extract()`) to pull single variables from a data frame, possibly naming each value by the row names of that data frame. - `reshape_ci()` gains a `ci_type` argument, to reshape data frames where CI-columns have prefixes other than `"CI"`. - `standardize()` and `center()` gain arguments `center` and `scale`, to define references for centrality and deviation that are used when centering or standardizing variables. - `center()` gains the arguments `force` and `reference`, similar to `standardize()`. - The functionality of the `append` argument in `center()` and `standardize()` was revised. This made the `suffix` argument redundant, and thus it was removed. - Fixed issue in `standardize()`. - Fixed issue in `data_findcols()`. # datawizard 0.2.1 - Exports `plot` method for `visualisation_recipe()` objects from `{see}` package. - `centre()`, `standardise()`, `unstandardise()` are exported as aliases for `center()`, `standardize()`, `unstandardize()`, respectively. # datawizard 0.2.0.1 - This is mainly a maintenance release that addresses some issues with conflicting namespaces. # datawizard 0.2.0 - New function: `visualisation_recipe()`. - The following function has now moved to *performance* package: `check_multimodal()`. - Minor updates to documentation, including a new vignette about `demean()`. # datawizard 0.1.0 * First release. ================================================ FILE: R/adjust.R ================================================ #' Adjust data for the effect of other variable(s) #' #' This function can be used to adjust the data for the effect of other #' variables present in the dataset. It is based on an underlying fitting of #' regressions models, allowing for quite some flexibility, such as including #' factors as random effects in mixed models (multilevel partialization), #' continuous variables as smooth terms in general additive models (non-linear #' partialization) and/or fitting these models under a Bayesian framework. The #' values returned by this function are the residuals of the regression models. #' Note that a regular correlation between two "adjusted" variables is #' equivalent to the partial correlation between them. #' #' @param data A data frame. #' @param effect Character vector of column names to be adjusted for (regressed #' out). If `NULL` (the default), all variables will be selected. #' @param multilevel If `TRUE`, the factors are included as random factors. #' Else, if `FALSE` (default), they are included as fixed effects in the #' simple regression model. #' @param additive If `TRUE`, continuous variables as included as smooth terms #' in additive models. The goal is to regress-out potential non-linear #' effects. #' @param bayesian If `TRUE`, the models are fitted under the Bayesian framework #' using `rstanarm`. #' @param keep_intercept If `FALSE` (default), the intercept of the model is #' re-added. This avoids the centering around 0 that happens by default #' when regressing out another variable (see the examples below for a #' visual representation of this). #' @inheritParams extract_column_names #' @inheritParams standardize #' #' @return A data frame comparable to `data`, with adjusted variables. #' #' @examplesIf all(insight::check_if_installed(c("bayestestR", "rstanarm", "gamm4"), quietly = TRUE)) #' adjusted_all <- adjust(attitude) #' head(adjusted_all) #' adjusted_one <- adjust(attitude, effect = "complaints", select = "rating") #' head(adjusted_one) #' \donttest{ #' adjust(attitude, effect = "complaints", select = "rating", bayesian = TRUE) #' adjust(attitude, effect = "complaints", select = "rating", additive = TRUE) #' attitude$complaints_LMH <- cut(attitude$complaints, 3) #' adjust(attitude, effect = "complaints_LMH", select = "rating", multilevel = TRUE) #' } #' #' # Generate data #' data <- bayestestR::simulate_correlation(n = 100, r = 0.7) #' data$V2 <- (5 * data$V2) + 20 # Add intercept #' #' # Adjust #' adjusted <- adjust(data, effect = "V1", select = "V2") #' adjusted_icpt <- adjust(data, effect = "V1", select = "V2", keep_intercept = TRUE) #' #' # Visualize #' plot( #' data$V1, data$V2, #' pch = 19, col = "blue", #' ylim = c(min(adjusted$V2), max(data$V2)), #' main = "Original (blue), adjusted (green), and adjusted - intercept kept (red) data" #' ) #' abline(lm(V2 ~ V1, data = data), col = "blue") #' points(adjusted$V1, adjusted$V2, pch = 19, col = "green") #' abline(lm(V2 ~ V1, data = adjusted), col = "green") #' points(adjusted_icpt$V1, adjusted_icpt$V2, pch = 19, col = "red") #' abline(lm(V2 ~ V1, data = adjusted_icpt), col = "red") #' #' @export adjust <- function( data, effect = NULL, select = is.numeric, exclude = NULL, multilevel = FALSE, additive = FALSE, bayesian = FALSE, keep_intercept = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE ) { # make sure column names are syntactically valid .check_dataframe_names(data, action = "error") # check for formula notation, convert to character vector if (inherits(effect, "formula")) { effect <- all.vars(effect) } # Find predictors if (is.null(effect)) { effect <- names(data) } if (is.null(select)) { select <- is.numeric } select <- .select_nse( select, data, exclude, ignore_case, regex = regex, verbose = verbose ) # Factors formula_random <- NULL facs <- names(data[effect][!vapply(data[effect], is.numeric, logical(1L))]) if (length(facs) >= 1 && multilevel) { if (additive) { formula_random <- stats::as.formula(paste( "~", paste(paste0("(1|", facs, ")"), collapse = " + ") )) } else { formula_random <- paste( "+", paste(paste0("(1|", facs, ")"), collapse = " + ") ) } effect <- effect[!effect %in% facs] } # Fit models out <- data.frame(.ID = seq_len(nrow(data))) for (var in select) { predictors <- effect[effect != var] if (additive) { predictors_num <- names(data[predictors][vapply( data[predictors], is.numeric, logical(1L) )]) predictors[predictors == predictors_num] <- paste0( "s(", predictors_num, ")" ) } formula_predictors <- paste(c("1", predictors), collapse = " + ") model_formula <- paste(var, "~", formula_predictors) x <- .model_adjust_for( data = data[unique(c(var, effect, facs))], model_formula = model_formula, multilevel = multilevel, additive = additive, bayesian = bayesian, formula_random = formula_random, keep_intercept = keep_intercept ) out[var] <- x } out[names(data)[!names(data) %in% names(out)]] <- data[names(data)[ !names(data) %in% names(out) ]] out[names(data)] } #' @rdname adjust #' @export data_adjust <- adjust #' @keywords internal .model_adjust_for <- function( data, model_formula, multilevel = FALSE, additive = FALSE, bayesian = FALSE, formula_random = NULL, keep_intercept = FALSE ) { # Additive ----------------------- if (additive) { # Bayesian if (bayesian) { insight::check_if_installed("rstanarm") model <- rstanarm::stan_gamm4( stats::as.formula(model_formula), random = formula_random, data = data, refresh = 0 ) # Frequentist } else { insight::check_if_installed("gamm4") model <- gamm4::gamm4( stats::as.formula(model_formula), random = formula_random, data = data ) } # Linear ------------------------- } else if (bayesian) { # Bayesian insight::check_if_installed("rstanarm") if (multilevel) { model <- rstanarm::stan_lmer( paste(model_formula, formula_random), data = data, refresh = 0 ) } else { model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0) } } else if (multilevel) { # Frequentist insight::check_if_installed("lme4") model <- lme4::lmer(paste(model_formula, formula_random), data = data) } else { model <- stats::lm(model_formula, data = data) } adjusted <- insight::get_residuals(model) # Re-add intercept if need be if (keep_intercept) { intercept <- insight::get_intercept(model) if (length(intercept) > 1) { intercept <- stats::median(intercept) } # For bayesian model if (is.na(intercept)) { intercept <- 0 } adjusted <- adjusted + intercept } # Deal with missing data out <- rep(NA, nrow(data)) out[stats::complete.cases(data)] <- as.vector(adjusted) out } ================================================ FILE: R/assign_labels.R ================================================ #' @title Assign variable and value labels #' @name assign_labels #' #' @description #' Assign variable and values labels to a variable or variables in a data frame. #' Labels are stored as attributes (`"label"` for variable labels and `"labels"`) #' for value labels. #' #' @param x A data frame, factor or vector. #' @param variable The variable label as string. #' @param values The value labels as (named) character vector. If `values` is #' *not* a named vector, the length of labels must be equal to the length of #' unique values. For a named vector, the left-hand side (LHS) is the value in #' `x`, the right-hand side (RHS) the associated value label. Non-matching #' labels are omitted. #' @param ... Currently not used. #' @inheritParams extract_column_names #' #' @inheritSection center Selection of variables - the `select` argument #' #' @return A labelled variable, or a data frame of labelled variables. #' #' @examples #' x <- 1:3 #' # labelling by providing required number of labels #' assign_labels( #' x, #' variable = "My x", #' values = c("one", "two", "three") #' ) #' #' # labelling using named vectors #' data(iris) #' out <- assign_labels( #' iris$Species, #' variable = "Labelled Species", #' values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3") #' ) #' str(out) #' #' # data frame example #' out <- assign_labels( #' iris, #' select = "Species", #' variable = "Labelled Species", #' values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3") #' ) #' str(out$Species) #' #' # Partial labelling #' x <- 1:5 #' assign_labels( #' x, #' variable = "My x", #' values = c(`1` = "lowest", `5` = "highest") #' ) #' @export assign_labels <- function(x, ...) { UseMethod("assign_labels") } #' @export assign_labels.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( sprintf( "Adding labels currently not possible for variables of class `%s`.", class(x)[1] ) ) } x } #' @rdname assign_labels #' @export assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) { # add variable label if (!is.null(variable)) { if (is.character(variable) && length(variable) == 1L) { attr(x, "label") <- variable } else { insight::format_error( "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." # nolint ) } } # if user just wants to add a variable label, skip next steps if (!is.null(values)) { # extract unique values unique_values <- as.vector(sort(stats::na.omit(unique(x)))) value_labels <- NULL # do we have a names vector for "values"? # else check if number of labels and values match if (is.null(names(values))) { if (length(values) == length(unique_values)) { value_labels <- stats::setNames(unique_values, values) } else { insight::format_error( "Cannot add labels. Number of unique values and number of value labels are not equal.", sprintf( "There are %i unique values and %i provided labels.", length(unique_values), length(values) ) ) } } else { # check whether we have matches of labels and values matching_labels <- names(values) %in% unique_values if (!all(matching_labels)) { insight::format_error( "Following labels were associated with values that don't exist:", text_concatenate( paste0( values[!matching_labels], " (", names(values)[!matching_labels], ")" ), enclose = "`" ) ) } values <- values[names(values) %in% unique_values] if (length(values)) { # we need to switch names and values value_labels <- stats::setNames( coerce_to_numeric(names(values)), values ) } } attr(x, "labels") <- value_labels } x } #' @export assign_labels.factor <- assign_labels.numeric #' @export assign_labels.character <- assign_labels.numeric #' @rdname assign_labels #' @export assign_labels.data.frame <- function( x, select = NULL, exclude = NULL, values = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) x[select] <- lapply( x[select], assign_labels, values = values, verbose = verbose, ... ) x } ================================================ FILE: R/categorize.R ================================================ #' @title Recode (or "cut" / "bin") data into groups of values. #' @name categorize #' #' @description #' This functions divides the range of variables into intervals and recodes #' the values inside these intervals according to their related interval. #' It is basically a wrapper around base R's `cut()`, providing a simplified #' and more accessible way to define the interval breaks (cut-off values). #' #' @param x A (grouped) data frame, numeric vector or factor. #' @param split Character vector, indicating at which breaks to split variables, #' or numeric values with values indicating breaks. If character, may be one #' of `"median"`, `"mean"`, `"quantile"`, `"equal_length"`, or `"equal_range"`. #' `"median"` or `"mean"` will return dichotomous variables, split at their #' mean or median, respectively. `"quantile"` and `"equal_length"` will split #' the variable into `n_groups` groups, where each group refers to an interval #' of a specific range of values. Thus, the length of each interval will be #' based on the number of groups. `"equal_range"` also splits the variable #' into multiple groups, however, the length of the interval is given, and #' the number of resulting groups (and hence, the number of breaks) will be #' determined by how many intervals can be generated, based on the full range #' of the variable. #' @param n_groups If `split` is `"quantile"` or `"equal_length"`, this defines #' the number of requested groups (i.e. resulting number of levels or values) #' for the recoded variable(s). `"quantile"` will define intervals based #' on the distribution of the variable, while `"equal_length"` tries to #' divide the range of the variable into pieces of equal length. #' @param range If `split = "equal_range"`, this defines the range of values #' that are recoded into a new value. #' @param lowest Minimum value of the recoded variable(s). If `NULL` (the default), #' for numeric variables, the minimum of the original input is preserved. For #' factors, the default minimum is `1`. For `split = "equal_range"`, the #' default minimum is always `1`, unless specified otherwise in `lowest`. #' @param breaks Character, indicating whether breaks for categorizing data are #' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or #' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_ #' group or interval to begin). Use `labels = "range"` to make this behaviour #' easier to see. #' @param labels Character vector of value labels. If not `NULL`, `categorize()` #' will returns factors instead of numeric variables, with `labels` used #' for labelling the factor levels. Can also be `"mean"`, `"median"`, #' `"range"` or `"observed"` for a factor with labels as the mean/median, #' the requested range (even if not all values of that range are present in #' the data) or observed range (range of the actual recoded values) of each #' group. See 'Examples'. #' @param append Logical or string. If `TRUE`, recoded or converted variables #' get new column names and are appended (column bind) to `x`, thus returning #' both the original and the recoded variables. The new columns get a suffix, #' based on the calling function: `"_r"` for recode functions, `"_n"` for #' `to_numeric()`, `"_f"` for `to_factor()`, or `"_s"` for #' `slide()`. If `append=FALSE`, original variables in `x` will be #' overwritten by their recoded versions. If a character value, recoded #' variables are appended with new column names (using the defined suffix) to #' the original data frame. #' @param ... not used. #' @inheritParams extract_column_names #' #' @inherit data_rename seealso #' #' @details #' #' # Splits and breaks (cut-off values) #' #' Breaks are by default _exclusive_, this means that these values indicate #' the lower bound of the next group or interval to begin. Take a simple #' example, a numeric variable with values from 1 to 9. The median would be 5, #' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 #' would turn into 2 (compare `cbind(1:9, categorize(1:9))`). The same variable, #' using `split = "quantile"` and `n_groups = 3` would define breaks at 3.67 #' and 6.33 (see `quantile(1:9, probs = c(1/3, 2/3))`), which means that values #' from 1 to 3 belong to the first interval and are recoded into 1 (because #' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. #' #' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which #' case #' #' # Recoding into groups with equal size or range #' #' `split = "equal_length"` and `split = "equal_range"` try to divide the #' range of `x` into intervals of similar (or same) length. The difference is #' that `split = "equal_length"` will divide the range of `x` into `n_groups` #' pieces and thereby defining the intervals used as breaks (hence, it is #' equivalent to `cut(x, breaks = n_groups)`), while `split = "equal_range"` #' will cut `x` into intervals that all have the length of `range`, where the #' first interval by defaults starts at `1`. The lowest (or starting) value #' of that interval can be defined using the `lowest` argument. #' #' @inheritSection center Selection of variables - the `select` argument #' #' @return `x`, recoded into groups. By default `x` is numeric, unless `labels` #' is specified. In this case, a factor is returned, where the factor levels #' (i.e. recoded groups are labelled accordingly. #' #' @examples #' set.seed(123) #' x <- sample(1:10, size = 50, replace = TRUE) #' #' table(x) #' #' # by default, at median #' table(categorize(x)) #' #' # into 3 groups, based on distribution (quantiles) #' table(categorize(x, split = "quantile", n_groups = 3)) #' #' # into 3 groups, user-defined break #' table(categorize(x, split = c(3, 5))) #' #' set.seed(123) #' x <- sample(1:100, size = 500, replace = TRUE) #' #' # into 5 groups, try to recode into intervals of similar length, #' # i.e. the range within groups is the same for all groups #' table(categorize(x, split = "equal_length", n_groups = 5)) #' #' # into 5 groups, try to return same range within groups #' # i.e. 1-20, 21-40, 41-60, etc. Since the range of "x" is #' # 1-100, and we have a range of 20, this results into 5 #' # groups, and thus is for this particular case identical #' # to the previous result. #' table(categorize(x, split = "equal_range", range = 20)) #' #' # return factor with value labels instead of numeric value #' set.seed(123) #' x <- sample(1:10, size = 30, replace = TRUE) #' categorize(x, "equal_length", n_groups = 3) #' categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")) #' #' # cut numeric into groups with the mean or median as a label name #' x <- sample(1:10, size = 30, replace = TRUE) #' categorize(x, "equal_length", n_groups = 3, labels = "mean") #' categorize(x, "equal_length", n_groups = 3, labels = "median") #' #' # cut numeric into groups with the requested range as a label name #' # each category has the same range, and labels indicate this range #' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") #' # in this example, each category has the same range, but labels only refer #' # to the ranges of the actual values (present in the data) inside each group #' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") #' @export categorize <- function(x, ...) { UseMethod("categorize") } #' @export categorize.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( paste0( "Variables of class `", class(x)[1], "` can't be recoded and remain unchanged." ) ) } x } #' @rdname categorize #' @export categorize.numeric <- function( x, split = "median", n_groups = NULL, range = NULL, lowest = 1, breaks = "exclusive", labels = NULL, verbose = TRUE, ... ) { # sanity check split <- .sanitize_split_arg(split, n_groups, range) # handle aliases if (identical(split, "equal_length")) { split <- "length" } if (identical(split, "equal_range")) { split <- "range" } # check for valid values breaks <- match.arg(breaks, c("exclusive", "inclusive")) # save original_x <- x # no missings x <- stats::na.omit(x) # stop if all NA if (!length(x)) { if (isTRUE(verbose)) { insight::format_alert( "Variable contains only missing values. No recoding carried out." ) } return(original_x) } if (is.numeric(split)) { category_splits <- split } else { category_splits <- switch( split, median = stats::median(x), mean = mean(x), length = n_groups, quantile = stats::quantile(x, probs = seq_len(n_groups) / n_groups), range = .equal_range(x, range, n_groups, lowest), NULL ) } # complete ranges, including minimum and maximum if (!identical(split, "length")) { category_splits <- unique(c(min(x), category_splits, max(x))) } # recode into groups out <- droplevels(cut( x, breaks = category_splits, include.lowest = TRUE, right = identical(breaks, "inclusive") )) cut_result <- out levels(out) <- 1:nlevels(out) # fix lowest value, add back into original vector out <- as.numeric(out) if (!is.null(lowest)) { out <- out - (min(out) - lowest) } original_x[!is.na(original_x)] <- out # turn into factor? .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...) } #' @export categorize.factor <- function(x, ...) { original_x <- x levels(x) <- 1:nlevels(x) out <- as.factor(categorize(as.numeric(x), ...)) .set_back_labels(out, original_x, include_values = FALSE) } #' @rdname categorize #' @export categorize.data.frame <- function( x, select = NULL, exclude = NULL, split = "median", n_groups = NULL, range = NULL, lowest = 1, breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_r" ) # update processed arguments x <- my_args$x select <- my_args$select } x[select] <- lapply( x[select], categorize, split = split, n_groups = n_groups, range = range, lowest = lowest, breaks = breaks, labels = labels, verbose = verbose, ... ) x } #' @export categorize.grouped_df <- function( x, select = NULL, exclude = NULL, split = "median", n_groups = NULL, range = NULL, lowest = 1, breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { grps <- attr(x, "groups", exact = TRUE)[[".rows"]] attr_data <- attributes(x) # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, remove_group_var = TRUE, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_r" ) # update processed arguments x <- my_args$x select <- my_args$select } x <- as.data.frame(x) for (rows in grps) { x[rows, ] <- categorize( x[rows, , drop = FALSE], split = split, n_groups = n_groups, range = range, lowest = lowest, breaks = breaks, labels = labels, select = select, exclude = exclude, append = FALSE, # need to set to FALSE here, else variable will be doubled ignore_case = ignore_case, verbose = verbose, ... ) } # set back class, so data frame still works with dplyr x <- .replace_attrs(x, attr_data) x } # tools -------------------- .equal_range <- function(x, range, n_groups, lowest = NULL) { if (is.null(lowest)) { lowest <- 1 } if (is.null(range)) { size <- ceiling((max(x) - min(x)) / n_groups) range <- as.numeric(size) } seq(lowest, max(x), by = range) } .sanitize_split_arg <- function(split, n_groups, range) { # check arguments if (is.character(split)) { split <- match.arg( split, choices = c( "median", "mean", "quantile", "equal_length", "equal_range", "equal", "equal_distance", "range", "distance" ) ) } if ( is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups) ) { insight::format_error( "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified." ) } if ( is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range) ) { insight::format_error( "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified." ) } split } .original_x_to_factor <- function( original_x, x, cut_result, labels, out, verbose, ... ) { if (!is.null(labels)) { if (length(labels) == length(unique(out))) { original_x <- as.factor(original_x) levels(original_x) <- labels } else if ( length(labels) == 1 && labels %in% c("mean", "median", "range", "observed") ) { original_x <- as.factor(original_x) no_na_x <- original_x[!is.na(original_x)] out <- switch( labels, mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x, median = stats::aggregate( x, list(no_na_x), FUN = stats::median, na.rm = TRUE )$x, # labels basically like what "cut()" returns range = levels(cut_result), # range based on the values that are actually present in the data { temp <- stats::aggregate( x, list(no_na_x), FUN = range, na.rm = TRUE )$x apply(temp, 1, function(i) { paste0("(", paste(as.vector(i), collapse = "-"), ")") }) } ) levels(original_x) <- insight::format_value(out, ...) } else if (isTRUE(verbose)) { insight::format_warning( "Argument `labels` and levels of the recoded variable are not of the same length.", "Variable will not be converted to factor." ) } } original_x } ================================================ FILE: R/center.R ================================================ #' Centering (Grand-Mean Centering) #' #' Performs a grand-mean centering of data. #' #' @param x A (grouped) data frame, a (numeric or character) vector or a factor. #' @param force Logical, if `TRUE`, forces centering of factors as #' well. Factors are converted to numerical values, with the lowest level #' being the value `1` (unless the factor has numeric levels, which are #' converted to the corresponding numeric value). #' @param robust Logical, if `TRUE`, centering is done by subtracting the #' median from the variables. If `FALSE`, variables are centered by #' subtracting the mean. #' @param append Logical or string. If `TRUE`, centered variables get new #' column names (with the suffix `"_c"`) and are appended (column bind) to `x`, #' thus returning both the original and the centered variables. If `FALSE`, #' original variables in `x` will be overwritten by their centered versions. #' If a character value, centered variables are appended with new column #' names (using the defined suffix) to the original data frame. #' @param verbose Toggle warnings and messages. #' @param weights Can be `NULL` (for no weighting), or: #' - For data frames: a numeric vector of weights, or a character of the #' name of a column in the `data.frame` that contains the weights. #' - For numeric vectors: a numeric vector of weights. #' @param center Numeric value, which can be used as alternative to #' `reference` to define a reference centrality. If `center` is of length 1, #' it will be recycled to match the length of selected variables for centering. #' Else, `center` must be of same length as the number of selected variables. #' Values in `center` will be matched to selected variables in the provided #' order, unless a named vector is given. In this case, names are matched #' against the names of the selected variables. #' @param ... Currently not used. #' @inheritParams extract_column_names #' @inheritParams standardize #' #' @section Selection of variables - the `select` argument: #' For most functions that have a `select` argument (including this function), #' the complete input data frame is returned, even when `select` only selects #' a range of variables. That is, the function is only applied to those variables #' that have a match in `select`, while all other variables remain unchanged. #' In other words: for this function, `select` will not omit any non-included #' variables, so that the returned data frame will include all variables #' from the input data frame. #' #' @note #' **Difference between centering and standardizing**: Standardized variables #' are computed by subtracting the mean of the variable and then dividing it by #' the standard deviation, while centering variables involves only the #' subtraction. #' #' @seealso If centering within-clusters (instead of grand-mean centering) #' is required, see [demean()]. For standardizing, see [standardize()], and #' [makepredictcall.dw_transformer()] for use in model formulas. #' #' @return The centered variables. #' #' @examples #' data(iris) #' #' # entire data frame or a vector #' head(iris$Sepal.Width) #' head(center(iris$Sepal.Width)) #' head(center(iris)) #' head(center(iris, force = TRUE)) #' #' # only the selected columns from a data frame #' center(anscombe, select = c("x1", "x3")) #' center(anscombe, exclude = c("x1", "x3")) #' #' # centering with reference center and scale #' d <- data.frame( #' a = c(-2, -1, 0, 1, 2), #' b = c(3, 4, 5, 6, 7) #' ) #' #' # default centering at mean #' center(d) #' #' # centering, using 0 as mean #' center(d, center = 0) #' #' # centering, using -5 as mean #' center(d, center = -5) #' @export center <- function(x, ...) { UseMethod("center") } #' @rdname center #' @export centre <- center #' @export center.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( sprintf( "Centering currently not possible for variables of class `%s`.", class(x)[1] ), "You may open an issue at https://github.com/easystats/datawizard/issues." ) } x } #' @rdname center #' @export center.numeric <- function( x, robust = FALSE, weights = NULL, reference = NULL, center = NULL, verbose = TRUE, ... ) { # set default. Furthermore, data.frame methods cannot return a vector # of NULLs for each variable - instead they return NA. Thus, we have to # treat NA like NULL if (is.null(center) || is.na(center)) { center <- TRUE } my_args <- .process_std_center( x, weights, robust, verbose, reference, center, scale = NULL ) dot_args <- list(...) if (is.null(my_args)) { # all NA? return(x) } else if (is.null(my_args$check)) { vals <- rep(0, length(my_args$vals)) # If only unique value } else { vals <- as.vector(my_args$vals - my_args$center) } centered_x <- rep(NA, length(my_args$valid_x)) centered_x[my_args$valid_x] <- vals attr(centered_x, "center") <- my_args$center attr(centered_x, "scale") <- 1 attr(centered_x, "robust") <- robust # labels z <- .set_back_labels(centered_x, x, include_values = FALSE) # don't add attribute when we call data frame methods if (!isFALSE(dot_args$add_transform_class)) { class(z) <- c("dw_transformer", class(z)) } z } #' @export center.factor <- function( x, robust = FALSE, weights = NULL, force = FALSE, verbose = TRUE, ... ) { if (!force) { return(x) } center( .factor_to_numeric(x), weights = weights, robust = robust, verbose = verbose, ... ) } #' @export center.logical <- center.factor #' @export center.character <- center.factor #' @export center.Date <- center.factor #' @export center.AsIs <- center.numeric #' @rdname center #' @inheritParams standardize.data.frame #' @export center.data.frame <- function( x, select = NULL, exclude = NULL, robust = FALSE, weights = NULL, reference = NULL, center = NULL, force = FALSE, remove_na = c("none", "selected", "all"), append = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # process arguments my_args <- .process_std_args( x, select, exclude, weights, append, append_suffix = "_c", keep_factors = force, remove_na, reference, .center = center, .scale = NULL ) # set new values x <- my_args$x for (var in my_args$select) { x[[var]] <- center( x[[var]], robust = robust, weights = my_args$weights, verbose = FALSE, reference = reference[[var]], center = my_args$center[var], force = force, add_transform_class = FALSE ) } attr(x, "center") <- vapply( x[my_args$select], function(z) attributes(z)$center, numeric(1) ) attr(x, "scale") <- vapply( x[my_args$select], function(z) attributes(z)$scale, numeric(1) ) attr(x, "robust") <- robust x } #' @export center.grouped_df <- function( x, select = NULL, exclude = NULL, robust = FALSE, weights = NULL, reference = NULL, center = NULL, force = FALSE, remove_na = c("none", "selected", "all"), append = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) my_args <- .process_grouped_df( x, select, exclude, append, append_suffix = "_c", reference, weights, keep_factors = force ) for (rows in my_args$grps) { my_args$x[rows, ] <- center( my_args$x[rows, , drop = FALSE], select = my_args$select, exclude = NULL, robust = robust, weights = my_args$weights, remove_na = remove_na, verbose = verbose, force = force, append = FALSE, center = center, add_transform_class = FALSE, ... ) } # set back class, so data frame still works with dplyr attributes(my_args$x) <- my_args$info my_args$x } # methods ------------------------- #' @export print.dw_transformer <- function(x, ...) { print(as.vector(x), ...) vector_info <- NULL if (!is.null(attributes(x)$scale)) { # attributes for center() / standardize() vector_info <- sprintf( "(center: %.2g, scale = %.2g)\n", attributes(x)$center, attributes(x)$scale ) } else if (!is.null(attributes(x)$range_difference)) { # attributes for normalize() / rescale() vector_info <- sprintf( "(original range = %.2g to %.2g)\n", attributes(x)$min_value, attributes(x)$min_value + attributes(x)$range_difference ) } if (!is.null(vector_info)) { insight::print_color(vector_info, color = "grey") } invisible(x) } ================================================ FILE: R/contrs.R ================================================ #' Deviation Contrast Matrix #' #' Build a deviation contrast matrix, a type of _effects contrast_ matrix. #' #' @inheritParams stats::contr.sum #' #' @details #' In effects coding, unlike treatment/dummy coding #' ([stats::contr.treatment()]), each contrast sums to 0. In regressions models, #' this results in an intercept that represents the (unweighted) average of the #' group means. In ANOVA settings, this also guarantees that lower order effects #' represent _main_ effects (and not _simple_ or _conditional_ effects, as is #' the case when using R's default [stats::contr.treatment()]). #' \cr\cr #' Deviation coding (`contr.deviation`) is a type of effects coding. With #' deviation coding, the coefficients for factor variables are interpreted as #' the difference of each factor level from the base level (this is the same #' interpretation as with treatment/dummy coding). For example, for a factor #' `group` with levels "A", "B", and "C", with `contr.devation`, the intercept #' represents the overall mean (average of the group means for the 3 groups), #' and the coefficients `groupB` and `groupC` represent the differences between #' the A group mean and the B and C group means, respectively. #' \cr\cr #' Sum coding ([stats::contr.sum()]) is another type of effects coding. With sum #' coding, the coefficients for factor variables are interpreted as the #' difference of each factor level from **the grand (across-groups) mean**. For #' example, for a factor `group` with levels "A", "B", and "C", with #' `contr.sum`, the intercept represents the overall mean (average of the group #' means for the 3 groups), and the coefficients `group1` and `group2` represent #' the differences the #' **A** and **B** group means from the overall mean, respectively. #' #' @seealso [stats::contr.sum()] #' #' @examplesIf !identical(Sys.getenv("IN_PKGDOWN"), "true") #' \donttest{ #' data("mtcars") #' #' mtcars <- data_modify(mtcars, cyl = factor(cyl)) #' #' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl)) #' solve(c.treatment) #' #> 4 6 8 #' #> Intercept 1 0 0 # mean of the 1st level #' #> 6 -1 1 0 # 2nd level - 1st level #' #> 8 -1 0 1 # 3rd level - 1st level #' #' contrasts(mtcars$cyl) <- contr.sum #' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl)) #' solve(c.sum) #' #> 4 6 8 #' #> Intercept 0.333 0.333 0.333 # overall mean #' #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean #' #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean #' #' #' contrasts(mtcars$cyl) <- contr.deviation #' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl)) #' solve(c.deviation) #' #> 4 6 8 #' #> Intercept 0.333 0.333 0.333 # overall mean #' #> 6 -1.000 1.000 0.000 # 2nd level - 1st level #' #> 8 -1.000 0.000 1.000 # 3rd level - 1st level #' #' ## With Interactions ----------------------------------------- #' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation)) #' mtcars <- data_arrange(mtcars, select = c("cyl", "am")) #' #' mm <- unique(model.matrix(~ cyl * am, data = mtcars)) #' rownames(mm) <- c( #' "cyl4.am0", "cyl4.am1", "cyl6.am0", #' "cyl6.am1", "cyl8.am0", "cyl8.am1" #' ) #' #' solve(mm) #' #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1 #' #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean #' #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st #' #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st #' #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff #' #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000 #' #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000 #' } #' #' @export contr.deviation <- function(n, base = 1, contrasts = TRUE, sparse = FALSE) { cont <- stats::contr.treatment( n, base = base, contrasts = contrasts, sparse = sparse ) if (contrasts) { n <- nrow(cont) cont <- cont - 1 / n } cont } ================================================ FILE: R/convert_na_to.R ================================================ #' @title Replace missing values in a variable or a data frame. #' @name convert_na_to #' #' @description #' Replace missing values in a variable or a data frame. #' #' @param x A numeric, factor, or character vector, or a data frame. #' @param replacement Numeric or character value that will be used to #' replace `NA`. #' @param verbose Toggle warnings. #' @param ... Not used. #' #' @inheritSection center Selection of variables - the `select` argument #' #' @return #' `x`, where `NA` values are replaced by `replacement`. #' #' @examples #' # Convert NA to 0 in a numeric vector #' convert_na_to( #' c(9, 3, NA, 2, 3, 1, NA, 8), #' replacement = 0 #' ) #' #' # Convert NA to "missing" in a character vector #' convert_na_to( #' c("a", NA, "d", "z", NA, "t"), #' replacement = "missing" #' ) #' #' ### For data frames #' #' test_df <- data.frame( #' x = c(1, 2, NA), #' x2 = c(4, 5, NA), #' y = c("a", "b", NA) #' ) #' #' # Convert all NA to 0 in numeric variables, and all NA to "missing" in #' # character variables #' convert_na_to( #' test_df, #' replace_num = 0, #' replace_char = "missing" #' ) #' #' # Convert a specific variable in the data frame #' convert_na_to( #' test_df, #' replace_num = 0, #' replace_char = "missing", #' select = "x" #' ) #' #' # Convert all variables starting with "x" #' convert_na_to( #' test_df, #' replace_num = 0, #' replace_char = "missing", #' select = starts_with("x") #' ) #' #' # Convert NA to 1 in variable 'x2' and to 0 in all other numeric #' # variables #' convert_na_to( #' test_df, #' replace_num = 0, #' select = list(x2 = 1) #' ) #' #' @export convert_na_to <- function(x, ...) { UseMethod("convert_na_to") } #' @export convert_na_to.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( sprintf( "Converting missing values (`NA`) into regular values currently not possible for variables of class `%s`.", class(x)[1] ) ) } x } #' @rdname convert_na_to #' @export convert_na_to.numeric <- function(x, replacement = NULL, verbose = TRUE, ...) { if (insight::is_empty_object(replacement) || !is.numeric(replacement)) { if (isTRUE(verbose)) { insight::format_warning("`replacement` needs to be a numeric vector.") } } else if (length(replacement) > 1) { if (isTRUE(verbose)) { insight::format_warning("`replacement` needs to be of length one.") } } else { x[is.na(x)] <- replacement } x } #' @export convert_na_to.factor <- function(x, replacement = NULL, verbose = TRUE, ...) { if (insight::is_empty_object(replacement) || length(replacement) > 1) { if (isTRUE(verbose)) { insight::format_warning("`replacement` needs to be of length one.") } } else { x <- addNA(x) levels(x) <- c(levels(x), replacement) x[is.na(x)] <- replacement } x } #' @rdname convert_na_to #' @export convert_na_to.character <- function( x, replacement = NULL, verbose = TRUE, ... ) { if ( insight::is_empty_object(replacement) || !is.character(replacement) && !is.numeric(replacement) ) { if (isTRUE(verbose)) { insight::format_warning( "`replacement` needs to be a character or numeric vector." ) } } else if (length(replacement) > 1) { if (isTRUE(verbose)) { insight::format_warning("`replacement` needs to be of length one.") } } else { x[is.na(x)] <- as.character(replacement) } x } #' @param replace_num Value to replace `NA` when variable is of type numeric. #' @param replace_char Value to replace `NA` when variable is of type character. #' @param replace_fac Value to replace `NA` when variable is of type factor. #' @inheritParams extract_column_names #' #' @rdname convert_na_to #' @export convert_na_to.data.frame <- function( x, select = NULL, exclude = NULL, replacement = NULL, replace_num = replacement, replace_char = replacement, replace_fac = replacement, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { my_data <- x select_nse <- .select_nse( select, data = my_data, exclude = exclude, ignore_case, regex = regex, verbose = verbose ) # default lookup <- lapply(x, function(y) { if (is.numeric(y)) { replace_num } else if (is.character(y)) { replace_char } else if (is.factor(y)) { replace_fac } }) # override for specific vars try_eval <- try(eval(select), silent = TRUE) select_is_list <- !inherits(try_eval, "try-error") && is.list(select) if (select_is_list) { for (i in select_nse) { lookup[[i]] <- select[[i]] } } else { lookup <- lookup[names(lookup) %in% select_nse] } lookup <- Filter(Negate(is.null), lookup) for (i in names(lookup)) { x[[i]] <- convert_na_to( x[[i]], replacement = lookup[[i]], verbose = verbose ) } x } ================================================ FILE: R/convert_to_na.R ================================================ #' @title Convert non-missing values in a variable into missing values. #' @name convert_to_na #' #' @description #' Convert non-missing values in a variable into missing values. #' #' @param x A vector, factor or a data frame. #' @param na Numeric, character vector or logical (or a list of numeric, character #' vectors or logicals) with values that should be converted to `NA`. Numeric #' values applied to numeric vectors, character values are used for factors, #' character vectors or date variables, and logical values for logical vectors. #' @param drop_levels Logical, for factors, when specific levels are replaced #' by `NA`, should unused levels be dropped? #' @param ... Not used. #' @inheritParams extract_column_names #' #' @return #' `x`, where all values in `na` are converted to `NA`. #' #' @examples #' x <- sample(1:6, size = 30, replace = TRUE) #' x #' # values 4 and 5 to NA #' convert_to_na(x, na = 4:5) #' #' # data frames #' set.seed(123) #' x <- data.frame( #' a = sample(1:6, size = 20, replace = TRUE), #' b = sample(letters[1:6], size = 20, replace = TRUE), #' c = sample(c(30:33, 99), size = 20, replace = TRUE) #' ) #' # for all numerics, convert 5 to NA. Character/factor will be ignored. #' convert_to_na(x, na = 5) #' #' # for numerics, 5 to NA, for character/factor, "f" to NA #' convert_to_na(x, na = list(6, "f")) #' #' # select specific variables #' convert_to_na(x, select = c("a", "b"), na = list(6, "f")) #' @export convert_to_na <- function(x, ...) { UseMethod("convert_to_na") } #' @export convert_to_na.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( sprintf( "Converting values into missing values (`NA`) currently not possible for variables of class `%s`.", class(x)[1] ) ) } x } #' @rdname convert_to_na #' @export convert_to_na.numeric <- function(x, na = NULL, verbose = TRUE, ...) { # if we have a list, use first valid element if (is.list(na)) { na <- unlist( na[vapply(na, is.numeric, FUN.VALUE = TRUE)], use.names = FALSE ) } if (insight::is_empty_object(na) || !is.numeric(na)) { if (isTRUE(verbose)) { insight::format_alert( "Could not convert values into `NA` for a numeric variable.", "To do this, `na` needs to be a numeric vector, or a list that contains numeric vector elements." ) } } else { matches <- which(x %in% na) x[matches] <- NA # drop unused labels value_labels <- attr(x, "labels", exact = TRUE) attr(x, "labels") <- value_labels[!value_labels %in% na] } x } #' @rdname convert_to_na #' @export convert_to_na.factor <- function( x, na = NULL, drop_levels = FALSE, verbose = TRUE, ... ) { # if we have a list, use first valid element if (is.list(na)) { na <- unlist( na[vapply(na, is.character, FUN.VALUE = TRUE)], use.names = FALSE ) } if (insight::is_empty_object(na) || (!is.factor(na) && !is.character(na))) { if (isTRUE(verbose)) { insight::format_alert( "Could not convert values into `NA` for a factor or character variable.", "To do this, `na` needs to be a character vector, or a list that contains character vector elements." ) } } else { matches <- which(x %in% na) x[matches] <- NA # drop unused labels value_labels <- attr(x, "labels", exact = TRUE) if (is.factor(x) && isTRUE(drop_levels)) { # save label attribute variable_label <- attr(x, "label", exact = TRUE) x <- droplevels(x) # droplevels() discards attributes, so we need to re-assign them attr(x, "label") <- variable_label } attr(x, "labels") <- value_labels[!value_labels %in% na] } x } #' @export convert_to_na.character <- convert_to_na.factor #' @export convert_to_na.Date <- function(x, na = NULL, verbose = TRUE, ...) { # if we have a list, use first valid element if (is.list(na)) { na <- na[vapply(na, .is_date, FUN.VALUE = logical(1L))] if (length(na) > 1) { na <- na[[1]] } } if (insight::is_empty_object(na) || !.is_date(na)) { if (isTRUE(verbose)) { insight::format_alert( "Could not convert values into `NA` for a date/time variable.", "To do this, `na` must be of class 'Date'." ) } } else { matches <- which(x == na) x[matches] <- NA } x } #' @export convert_to_na.logical <- function(x, na = NULL, verbose = TRUE, ...) { # if we have a list, use first valid element if (is.list(na)) { na <- unlist( na[vapply(na, is.logical, FUN.VALUE = TRUE)], use.names = FALSE ) } if (insight::is_empty_object(na) || !is.logical(na)) { if (isTRUE(verbose)) { insight::format_alert( "Could not convert values into `NA` for a logical variable.", "To do this, `na` needs to be a logical vector, or a list that contains logical vector elements." ) } } else { matches <- which(x == na) x[matches] <- NA } x } #' @rdname convert_to_na #' @export convert_to_na.data.frame <- function( x, select = NULL, exclude = NULL, na = NULL, drop_levels = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) x[select] <- lapply( x[select], convert_to_na, na = na, drop_levels = drop_levels, verbose = verbose, ... ) x } ================================================ FILE: R/data.R ================================================ #' @docType data #' @title Sample dataset from the National Health and Nutrition Examination Survey #' @name nhanes_sample #' @keywords data #' #' @description Selected variables from the National Health and Nutrition Examination #' Survey that are used in the example from Lumley (2010), Appendix E. #' #' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley NULL #' @docType data #' @title Sample dataset from the EFC Survey #' @name efc #' @keywords data #' #' @description Selected variables from the EUROFAMCARE survey. Useful when #' testing on "real-life" data sets, including random missing values. This #' data set also has value and variable label attributes. NULL ================================================ FILE: R/data_addprefix.R ================================================ #' Add a prefix or suffix to column names #' #' @rdname data_prefix_suffix #' @inheritParams extract_column_names #' @param pattern A character string, which will be added as prefix or suffix #' to the column names. #' @param ... Other arguments passed to or from other functions. #' #' @seealso #' [data_rename()] for more fine-grained column renaming. #' @examples #' # Add prefix / suffix to all columns #' head(data_addprefix(iris, "NEW_")) #' head(data_addsuffix(iris, "_OLD")) #' #' @export data_addprefix <- function( data, pattern, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, data, exclude, ignore_case, regex = regex, verbose = verbose ) selected_columns <- colnames(data) %in% select colnames(data)[selected_columns] <- paste0( pattern, colnames(data)[selected_columns] ) data } #' @rdname data_prefix_suffix #' @export data_addsuffix <- function( data, pattern, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, data, exclude, ignore_case, regex = regex, verbose = verbose ) selected_columns <- colnames(data) %in% select colnames(data)[selected_columns] <- paste0( colnames(data)[selected_columns], pattern ) data } ================================================ FILE: R/data_arrange.R ================================================ #' Arrange rows by column values #' #' `data_arrange()` orders the rows of a data frame by the values of selected #' columns. #' #' @param data A data frame, or an object that can be coerced to a data frame. #' @param select Character vector of column names. Use a dash just before column #' name to arrange in decreasing order, for example `"-x1"`. #' @param safe Do not throw an error if one of the variables specified doesn't #' exist. #' #' @return A data frame. #' #' @examples #' #' # Arrange using several variables #' data_arrange(head(mtcars), c("gear", "carb")) #' #' # Arrange in decreasing order #' data_arrange(head(mtcars), "-carb") #' #' # Throw an error if one of the variables specified doesn't exist #' try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE)) #' @export data_arrange <- function(data, select = NULL, safe = TRUE) { UseMethod("data_arrange") } #' @export data_arrange.default <- function(data, select = NULL, safe = TRUE) { if (is.null(select) || length(select) == 0) { return(data) } original_x <- data # Input validation check data <- .coerce_to_dataframe(data) # Remove tidyverse attributes, will add them back at the end if (inherits(original_x, "tbl_df")) { tbl_input <- TRUE data <- as.data.frame(data, stringsAsFactors = FALSE) } else { tbl_input <- FALSE } # find which vars should be decreasing desc <- select[startsWith(select, "-")] desc <- gsub("^-", "", desc) select <- gsub("^-", "", select) # check for variables that are not in data dont_exist <- setdiff(select, colnames(data)) if (length(dont_exist) > 0) { if (safe) { insight::format_warning( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." ), .misspelled_string(names(data), dont_exist, "Possibly misspelled?") ) } else { insight::format_error( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." ), .misspelled_string(names(data), dont_exist, "Possibly misspelled?") ) } select <- select[-which(select %in% dont_exist)] } if (length(select) == 0) { return(data) } already_sorted <- all(vapply( data[, select, drop = FALSE], .is_sorted, logical(1L) )) if (isTRUE(already_sorted)) { return(data) } out <- data # reverse order for variables that should be decreasing if (length(desc) > 0) { for (i in desc) { out[[i]] <- -xtfrm(out[[i]]) } } # apply ordering if (length(select) == 1) { out <- data[order(out[[select]]), , drop = FALSE] } else { out <- data[do.call(order, out[, select]), , drop = FALSE] } if (!insight::object_has_rownames(data)) { rownames(out) <- NULL } # add back custom attributes out <- .replace_attrs(out, attributes(original_x)) out } #' @export data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) { original_x <- data grps <- attr(data, "groups", exact = TRUE) grps <- grps[[".rows"]] # Remove tidyverse attributes, will add them back at the end if (inherits(data, "tbl_df")) { tbl_input <- TRUE data <- as.data.frame(data, stringsAsFactors = FALSE) } else { tbl_input <- FALSE } out <- lapply(grps, function(x) { data_arrange.default(data[x, ], select = select, safe = safe) }) out <- do.call(rbind, out) if (!insight::object_has_rownames(data)) { rownames(out) <- NULL } # add back tidyverse attributes if (isTRUE(tbl_input)) { class(out) <- c("tbl_df", "tbl", "data.frame") } # add back custom attributes out <- .replace_attrs(out, attributes(original_x)) out } ================================================ FILE: R/data_codebook.R ================================================ #' Generate a codebook of a data frame. #' #' `data_codebook()` generates codebooks from data frames, i.e. overviews #' of all variables and some more information about each variable (like #' labels, values or value range, frequencies, amount of missing values). #' #' @param data A data frame, or an object that can be coerced to a data frame. #' @param variable_label_width Length of variable labels. Longer labels will be #' wrapped at `variable_label_width` chars. If `NULL`, longer labels will not #' be split into multiple lines. Only applies to _labelled data_. #' @param value_label_width Length of value labels. Longer labels will be #' shortened, where the remaining part is truncated. Only applies to #' _labelled data_ or factor levels. #' @param range_at Indicates how many unique values in a numeric vector are #' needed in order to print a range for that variable instead of a frequency #' table for all numeric values. Can be useful if the data contains numeric #' variables with only a few unique values and where full frequency tables #' instead of value ranges should be displayed. #' @param max_values Number of maximum values that should be displayed. Can be #' used to avoid too many rows when variables have lots of unique values. #' @param font_size For HTML tables, the font size. #' @param line_padding For HTML tables, the distance (in pixel) between lines. #' @param row_color For HTML tables, the fill color for odd rows. #' @inheritParams standardize.data.frame #' @inheritParams extract_column_names #' @inheritParams data_tabulate #' #' @return A formatted data frame, summarizing the content of the data frame. #' Returned columns include the column index of the variables in the original #' data frame (`ID`), column name, variable label (if data is labelled), type #' of variable, number of missing values, unique values (or value range), #' value labels (for labelled data), and a frequency table (N for each value). #' Most columns are formatted as character vectors. #' #' @note There are methods to `print()` the data frame in a nicer output, as #' well methods for printing in markdown or HTML format (`print_md()` and #' `print_html()`). The `print()` method for text outputs passes arguments in #' `...` to [`insight::export_table()`]. #' #' @examples #' data(iris) #' data_codebook(iris, select = starts_with("Sepal")) #' #' data(efc) #' data_codebook(efc) #' #' # shorten labels #' data_codebook(efc, variable_label_width = 20, value_label_width = 15) #' #' # automatic range for numerics at more than 5 unique values #' data(mtcars) #' data_codebook(mtcars, select = starts_with("c")) #' #' # force all values to be displayed #' data_codebook(mtcars, select = starts_with("c"), range_at = 100) #' @export data_codebook <- function( data, select = NULL, exclude = NULL, variable_label_width = NULL, value_label_width = NULL, max_values = 10, range_at = 6, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { data_name <- insight::safe_deparse(substitute(data)) # evaluate select/exclude, may be select-helpers select <- .select_nse( select, data, exclude, ignore_case, regex = regex, verbose = verbose ) # check for emtpy columns, and remove empty <- empty_columns(data[select]) if (length(empty)) { if (verbose) { insight::format_warning( sprintf( "Following %i columns were empty and have been removed:", length(empty) ), text_concatenate(names(empty)) ) } select <- select[-empty] } # check if any columns left, or found if (!length(select) || is.null(select)) { if (isTRUE(verbose)) { insight::format_warning( "No column names that matched the required search pattern were found." ) } return(NULL) } # needed for % NA rows <- nrow(data) max_values <- max_values + 1 out <- lapply(seq_along(select), function(id) { # variable x <- data[[select[id]]] x_na <- is.na(x) x_inf <- is.infinite(x) # inital data frame for codebook d <- data.frame( ID = which(colnames(data) == select[id]), Name = select[id], Type = .variable_type(x), Missings = sprintf("%g (%.1f%%)", sum(x_na), 100 * (sum(x_na) / rows)), stringsAsFactors = FALSE, row.names = NULL, check.names = FALSE ) # check if there are variable labels variable_label <- .extract_variable_labels(x, variable_label_width) # we may need to remove duplicated value range elements flag_range <- FALSE # save value labels vallab <- attr(x, "labels", exact = TRUE) # do we have labelled NA values? If so, include labelled NAs in count table # we do this by converting NA values into character strings if (anyNA(vallab) && insight::check_if_installed("haven", quietly = TRUE)) { # get na-tags, i.e. the value labels for the different NA values na_labels <- haven::na_tag(vallab) # replace NA in labels with NA tags vallab[!is.na(na_labels)] <- stats::setNames( paste0("NA(", na_labels[!is.na(na_labels)], ")"), names(vallab[!is.na(na_labels)]) ) # replace tagged NAs in variable with their values, tagged as NA(value) na_values <- haven::na_tag(x) # need to convert, we still have haven-class, which cannot coerce x <- as.character(x) x[!is.na(na_values)] <- paste0("NA(", na_values[!is.na(na_values)], ")") # update information on NA - we still might have non-labelled (regular) NA x_na <- is.na(x) } # remove NA and Inf, for tabulate(). as.factor() will convert NaN # to a factor level "NaN", which we don't want here (same for Inf), # because tabulate() will then return frequencies for that level, too x <- x[!(x_na | x_inf)] # get unique values, to remove non labelled data unique_values <- unique(x) # coerce to factor, for tabulate(). We will coerce numerics to factor later # which is required because tabulate() doesn't return frequencies for values # lower than 1 if (!is.numeric(x) && !is.factor(x)) { x <- as.factor(x) } # for ranges, we don't want the N% value, so use this to flag range-values is_range <- FALSE # handle labelled data - check if there are value labels or factor levels, # and extract values and N if (!is.null(vallab) && length(vallab)) { # if not all values are labelled, fill in value labels if (!all(unique_values %in% vallab)) { new_vals <- setdiff(unique_values, vallab) vallab <- c(vallab, stats::setNames(new_vals, new_vals)) } # if not all value labels are present in the data, remove unused value labels if (!all(vallab %in% unique_values)) { not_needed <- setdiff(vallab, unique_values) # match not needed values in vallab vector - values from labels # may not be in sorted order (e.g. 1, 2, 3, -9), or may be character # vectors in case of tagged NAs, so we have to make sure we know which # values can be removed from vallab not_needed <- stats::na.omit(match(not_needed, vallab)) vallab <- vallab[-not_needed] } # we now should have the same length of value labels and labelled values # which should also match the numberof unique values in the vector. # "tabulate" creates frequency tables by sorting by values/levels, so # we need to make sure that labels are also in sorted order. value_labels <- names(vallab)[order(unname(vallab))] values <- sort(unname(vallab)) frq <- tabulate(as.factor(x)) # handle factors } else if (is.factor(x)) { values <- levels(x) value_labels <- NA frq <- tabulate(x) # handle numerics } else { value_labels <- NA # only range for too many unique values if (length(unique_values) >= range_at) { r <- range(x, na.rm = TRUE) values <- sprintf("[%g, %g]", round(r[1], 2), round(r[2], 2)) frq <- sum(!x_na) flag_range <- length(variable_label) > 1 is_range <- TRUE # if we have few values, we can print whole freq. table } else { values <- sort(unique_values) frq <- tabulate(as.factor(x)) } } # tabulate fills 0 for non-existing values, remove those frq <- frq[frq != 0] # add Inf values? if (any(x_inf) && length(frq) <= max_values) { values <- c(values, Inf) if (!is.na(value_labels)) { value_labels <- c(value_labels, "infinite") } frq <- c(frq, sum(x_inf)) # Inf are added as value, so don't flag range any more, # since we now have proportions for the range and the inf values. is_range <- FALSE } # add proportions, but not for ranges, since these are always 100% if (is_range) { frq_proportions <- "" } else { frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1)) } # make sure we have not too long rows, e.g. for variables that # have dozens of unique values if (length(value_labels) > max_values) { value_labels <- value_labels[1:max_values] value_labels[max_values] <- "(...)" } if (length(frq) > max_values) { frq <- frq[1:max_values] frq_proportions <- frq_proportions[1:max_values] frq[max_values] <- NA frq_proportions[max_values] <- NA } if (length(values) > max_values) { values <- values[1:max_values] values[max_values] <- "(...)" } # make sure length recycling doesn't fail, e.g. if we have split # variable_label into two lines (i.e. vector of length 2), but we have # 7 values in "frq", creating the data frame will fail. In this case, # we have to make sure that recycling shorter vectors works. if (length(variable_label) > 1 && !flag_range) { variable_label <- variable_label[seq_along(frq)] } # shorten value labels if (!is.null(value_label_width)) { value_labels <- insight::format_string( value_labels, length = value_label_width ) } # add values, value labels and frequencies to data frame d <- cbind( d, data.frame( variable_label, values, value_labels, frq, proportions = frq_proportions, stringsAsFactors = FALSE ) ) # which columns need to be checked for duplicates? duplicates <- c("ID", "Name", "Type", "Missings", "variable_label") if (isTRUE(flag_range)) { # when we have numeric variables with value range as values, and when # these variables had long variable labels that have been wrapped, # the range value is duplicated (due to recycling), so we need to fix # this here. duplicates <- c(duplicates, c("values", "frq", "proportions")) } # clear duplicates due to recycling for (i in duplicates) { d[[i]][duplicated(d[[i]])] <- "" } # remove empty rows d <- remove_empty_rows(d) # add empty row at the end, as separator d[nrow(d) + 1, ] <- rep("", ncol(d)) # add row ID d$.row_id <- id d }) # clean-up (column order, rename, ...) out <- .finalize_result(do.call(rbind, out)) # add attributes .add_codebook_attributes(out, data_name, data, select) } # helper ----------------------- #' @keywords internal .extract_variable_labels <- function(x, variable_label_width = NULL) { varlab <- attr(x, "label", exact = TRUE) if (!is.null(varlab) && length(varlab)) { variable_label <- varlab # if variable labels are too long, split into multiple elements if ( !is.null(variable_label_width) && nchar(variable_label) > variable_label_width ) { variable_label <- insight::trim_ws(unlist( strsplit( text_wrap(variable_label, width = variable_label_width), "\n", fixed = TRUE ), use.names = FALSE )) } } else { variable_label <- NA } variable_label } #' @keywords internal .finalize_result <- function(out) { # rename pattern <- c("variable_label", "values", "value_labels", "frq", "proportions") replacement <- c("Label", "Values", "Value Labels", "N", "Prop") for (i in seq_along(pattern)) { names(out) <- replace(names(out), names(out) == pattern[i], replacement[i]) } # remove all empty columns out <- remove_empty_columns(out) # reorder column_order <- c( "ID", "Name", "Label", "Type", "Missings", "Values", "Value Labels", "N", "Prop", ".row_id" ) out[union(intersect(column_order, names(out)), names(out))] } #' @keywords internal .add_codebook_attributes <- function(out, data_name, data, select) { attr(out, "data_name") <- data_name attr(out, "n_rows") <- nrow(data) attr(out, "n_cols") <- ncol(data) attr(out, "n_shown") <- length(select) class(out) <- c("data_codebook", "data.frame") out } # methods ---------------------- #' @export format.data_codebook <- function(x, format = "text", ...) { # use [["N"]] to avoid partial matching if (any(stats::na.omit(nchar(x[["N"]]) > 5))) { x[["N"]] <- insight::trim_ws(prettyNum(x[["N"]], big.mark = ",")) x[["N"]][x[["N"]] == "NA" | is.na(x[["N"]])] <- "" } # merge N and % if (!is.null(x$Prop)) { x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- "" # align only for text format if (identical(format, "text")) { x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right") # nolint } x[["N"]][x$Prop != ""] <- sprintf( # nolint "%s (%s)", as.character(x[["N"]][x$Prop != ""]), # nolint x$Prop[x$Prop != ""] # nolint ) x$Prop <- NULL } x } #' @export print.data_codebook <- function(x, ...) { caption <- c(.get_codebook_caption(x), "blue") x$.row_id <- NULL cat( insight::export_table( format(x), title = caption, empty_line = "-", cross = "+", align = .get_codebook_align(x), ... ) ) } #' @rdname data_codebook #' @export print_html.data_codebook <- function( x, font_size = "100%", line_padding = 3, row_color = "#eeeeee", ... ) { caption <- .get_codebook_caption(x) attr(x, "table_caption") <- caption # since we have each value at its own row, the HTML table contains # horizontal borders for each cell/row. We want to remove those borders # from rows that actually belong to one variable separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint # remove separator lines, as we don't need these for HTML tables x <- x[-separator_lines, ] # check row IDs, and find odd rows odd_rows <- (x$.row_id %% 2 == 1) x$.row_id <- NULL # create basic table backend <- .check_format_backend(...) out <- insight::export_table( format(x, format = "html"), title = caption, format = backend, align = .get_codebook_align(x) ) # for tiny table output, we don't need to do any further formatting if (identical(backend, "tt")) { return(out) } insight::check_if_installed("gt") # no border for rows which are not separator lines out <- gt::tab_style( out, style = list(gt::cell_borders(sides = "top", style = "hidden")), locations = gt::cells_body(rows = which(x$ID == "")) # nolint ) # highlight odd rows if (!is.null(row_color)) { out <- gt::tab_style( out, style = list(gt::cell_fill(color = row_color)), locations = gt::cells_body(rows = odd_rows) ) } # set up additonal HTML options gt::tab_options( out, table.font.size = font_size, data_row.padding = gt::px(line_padding) ) } #' @rdname data_codebook #' @export display.data_codebook <- function( object, format = "markdown", font_size = "100%", line_padding = 3, row_color = "#eeeeee", ... ) { format <- .display_default_format(format) fun_args <- list( x = object, font_size = font_size, line_padding = line_padding, row_color = row_color, ... ) # print table in HTML or markdown format if (format %in% c("html", "tt")) { fun_args$backend <- format do.call(print_html, fun_args) } else { do.call(print_md, fun_args) } } #' @export print_md.data_codebook <- function(x, ...) { caption <- .get_codebook_caption(x) x$.row_id <- NULL attr(x, "table_caption") <- caption insight::export_table( format(x, format = "markdown"), title = caption, align = .get_codebook_align(x), format = "markdown" ) } # helper --------- .get_codebook_caption <- function(x) { n_rows <- as.character(attributes(x)$n_rows) if (nchar(n_rows) > 5) { n_rows <- prettyNum(n_rows, big.mark = ",") } sprintf( "%s (%s rows and %i variables, %i shown)", attributes(x)$data_name, n_rows, attributes(x)$n_cols, attributes(x)$n_shown ) } .get_codebook_align <- function(x) { # need to remove this one x$Prop <- NULL align <- c( ID = "l", Name = "l", Label = "l", Type = "l", Missings = "r", Values = "r", `Value Labels` = "l", N = "r" ) align <- align[colnames(x)] paste(unname(align), collapse = "") } ================================================ FILE: R/data_duplicated.R ================================================ #' @title Extract all duplicates #' #' @description Extract all duplicates, for visual inspection. #' Note that it also contains the first occurrence of future #' duplicates, unlike [duplicated()] or [dplyr::distinct()]). Also #' contains an additional column reporting the number of missing #' values for that row, to help in the decision-making when #' selecting which duplicates to keep. #' #' @inheritParams extract_column_names #' #' @keywords duplicates #' @export #' @seealso #' [data_unique()] #' @return A dataframe, containing all duplicates. #' @examples #' df1 <- data.frame( #' id = c(1, 2, 3, 1, 3), #' year = c(2022, 2022, 2022, 2022, 2000), #' item1 = c(NA, 1, 1, 2, 3), #' item2 = c(NA, 1, 1, 2, 3), #' item3 = c(NA, 1, 1, 2, 3) #' ) #' #' data_duplicated(df1, select = "id") #' #' data_duplicated(df1, select = c("id", "year")) #' #' # Filter to exclude duplicates #' df2 <- df1[-c(1, 5), ] #' df2 #' data_duplicated <- function( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) { UseMethod("data_duplicated") } #' @export data_duplicated.data.frame <- function( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) { select <- .select_nse( select, data, exclude = exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) data$temporary_id <- do.call(paste, c(data_select(data, select), sep = "_")) data <- cbind(Row = seq_len(nrow(data)), data) dups.index <- data$temporary_id %in% data$temporary_id[duplicated(data$temporary_id)] dups <- data[dups.index, ] dups$count_na <- rowSums(is.na(dups)) dups <- data_arrange(dups, select) dups <- data_remove(dups, "temporary_id") dups } #' @export data_duplicated.grouped_df <- function( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) { select <- .select_nse( select, data, exclude = exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) grps <- attr(data, "groups", exact = TRUE) grps <- grps[[".rows"]] data <- as.data.frame(data) out <- lapply(grps, function(x) { data_duplicated.data.frame(data[x, ], select = select) }) out <- do.call(rbind, out) out } ================================================ FILE: R/data_extract.R ================================================ #' Extract one or more columns or elements from an object #' #' `data_extract()` (or its alias `extract()`) is similar to `$`. It extracts #' either a single column or element from an object (e.g., a data frame, list), #' or multiple columns resp. elements. #' #' @param data The object to subset. Methods are currently available for data frames #' and data frame extensions (e.g., tibbles). #' @param name An optional argument that specifies the column to be used as #' names for the vector elements after extraction. Must be specified either #' as literal variable name (e.g., `column_name`) or as string #' (`"column_name"`). `name` will be ignored when a data frame is returned. #' @param extract String, indicating which element will be extracted when `select` #' matches multiple variables. Can be `"all"` (the default) to return all #' matched variables, `"first"` or `"last"` to return the first or last match, #' or `"odd"` and `"even"` to return all odd-numbered or even-numbered #' matches. Note that `"first"` or `"last"` return a vector (unless #' `as_data_frame = TRUE`), while `"all"` can return a vector (if only one #' match was found) *or* a data frame (for more than one match). Type safe #' return values are only possible when `extract` is `"first"` or `"last"` (will #' always return a vector) or when `as_data_frame = TRUE` (always returns a #' data frame). #' @param as_data_frame Logical, if `TRUE`, will always return a data frame, #' even if only one variable was matched. If `FALSE`, either returns a vector #' or a data frame. See `extract` for details. #' @param verbose Toggle warnings. #' @param ... For use by future methods. #' #' @inheritParams extract_column_names #' #' @details `data_extract()` can be used to select multiple variables or pull a #' single variable from a data frame. Thus, the return value is by default not #' type safe - `data_extract()` either returns a vector or a data frame. #' \subsection{Extracting single variables (vectors)}{ #' When `select` is the name of a single column, or when select only matches #' one column, a vector is returned. A single variable is also returned when #' `extract` is either `"first` or `"last"`. Setting `as_data_frame` to `TRUE` #' overrides this behaviour and *always* returns a data frame. #' } #' \subsection{Extracting a data frame of variables}{ #' When `select` is a character vector containing more than one column name (or #' a numeric vector with more than one valid column indices), or when `select` #' uses one of the supported select-helpers that match multiple columns, a #' data frame is returned. Setting `as_data_frame` to `TRUE` *always* returns #' a data frame. #' } #' #' @return A vector (or a data frame) containing the extracted element, or #' `NULL` if no matching variable was found. #' @export #' #' @examples #' # single variable #' data_extract(mtcars, cyl, name = gear) #' data_extract(mtcars, "cyl", name = gear) #' data_extract(mtcars, -1, name = gear) #' data_extract(mtcars, cyl, name = 0) #' data_extract(mtcars, cyl, name = "row.names") #' #' # selecting multiple variables #' head(data_extract(iris, starts_with("Sepal"))) #' head(data_extract(iris, ends_with("Width"))) #' head(data_extract(iris, 2:4)) #' #' # select first of multiple variables #' data_extract(iris, starts_with("Sepal"), extract = "first") #' #' # select first of multiple variables, return as data frame #' head(data_extract(iris, starts_with("Sepal"), extract = "first", as_data_frame = TRUE)) data_extract <- function(data, select, ...) { UseMethod("data_extract") } #' @rdname data_extract #' @export data_extract.data.frame <- function( data, select, name = NULL, extract = "all", as_data_frame = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { extract <- match.arg( tolower(extract), choices = c("all", "first", "last", "odd", "even") ) # evaluate arguments select <- .select_nse( select, data, exclude = NULL, ignore_case, regex = regex, verbose = verbose ) # nothing to select? if (!length(select)) { return(NULL) } nl <- as.list(seq_along(data)) names(nl) <- names(data) name <- eval(substitute(name), nl, parent.frame()) if (is.numeric(name) && length(name) == 1L) { if (name < 0L) { name <- ncol(data) + name + 1L } else if (name == 0L) { name <- rownames(data) } } else if (is.character(name) && identical(name, "row.names")) { name <- rownames(data) } # chose which matched variables to extract select <- switch( extract, first = select[1L], last = select[length(select)], odd = select[seq(1L, length(select), 2L)], even = select[seq(2L, length(select), 2L)], select ) # "name" only used for naming elements in a vector, not data frame needs_no_names <- isTRUE(as_data_frame) || # more than one variable means data frame, so no name length(select) > 1L || # if we have only one variable, but number of observations not equal to # length of names, we have no proper match, so no naming, too. (length(select) == 1L && length(name) > 1L && length(data[[select]]) != length(name)) if (needs_no_names) { name <- NULL } # we definitely should have a vector here when name not NULL if (is.null(name)) { data[, select, drop = !as_data_frame] } else { # if name indicates a variable, extract values for naming now if (length(name) == 1L) { name <- data[[name]] } stats::setNames(data[[select]], name) } } ================================================ FILE: R/data_group.R ================================================ #' @title Create a grouped data frame #' @name data_group #' #' @description This function is comparable to `dplyr::group_by()`, but just #' following the **datawizard** function design. `data_ungroup()` removes the #' grouping information from a grouped data frame. #' #' @param data A data frame #' @inheritParams extract_column_names #' #' @return A grouped data frame, i.e. a data frame with additional information #' about the grouping structure saved as attributes. #' #' @examplesIf requireNamespace("poorman") #' data(efc) #' suppressPackageStartupMessages(library(poorman, quietly = TRUE)) #' #' # total mean #' efc %>% #' summarize(mean_hours = mean(c12hour, na.rm = TRUE)) #' #' # mean by educational level #' efc %>% #' data_group(c172code) %>% #' summarize(mean_hours = mean(c12hour, na.rm = TRUE)) #' @export data_group <- function( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # variables for grouping select <- .select_nse( select, data, exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) # create grid with combinations of all levels my_grid <- as.data.frame(expand.grid(lapply(data[select], unique))) # sort grid my_grid <- my_grid[do.call(order, my_grid), , drop = FALSE] .rows <- lapply(seq_len(nrow(my_grid)), function(i) { as.integer(data_match( data, to = my_grid[i, , drop = FALSE], match = "and", return_indices = TRUE, remove_na = FALSE )) }) my_grid[[".rows"]] <- .rows # remove data_match attributes attr(my_grid, "out.attrs") <- NULL attr(my_grid, ".drop") <- TRUE attr(data, "groups") <- my_grid class(data) <- unique(c("grouped_df", "data.frame"), class(data)) data } #' @rdname data_group #' @export data_ungroup <- function(data, verbose = TRUE, ...) { attr(data, "groups") <- NULL class(data) <- unique(setdiff(class(data), "grouped_df")) data } ================================================ FILE: R/data_match.R ================================================ #' Return filtered or sliced data frame, or row indices #' #' Return a filtered (or sliced) data frame or row indices of a data frame that #' match a specific condition. `data_filter()` works like `data_match()`, but works #' with logical expressions or row indices of a data frame to specify matching #' conditions. #' #' @param x A data frame. #' @param to A data frame matching the specified conditions. Note that if #' `match` is a value other than `"and"`, the original row order might be #' changed. See 'Details'. #' @param match String, indicating with which logical operation matching #' conditions should be combined. Can be `"and"` (or `"&"`), `"or"` (or `"|"`) #' or `"not"` (or `"!"`). #' @param return_indices Logical, if `TRUE`, return the vector of rows that #' can be used to filter the original data frame. If `FALSE` (default), #' returns directly the filtered data frame instead of the row indices. #' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before #' filtering the data. This is the default behaviour, however, sometimes when #' row indices are requested (i.e. `return_indices=TRUE`), it might be useful #' to preserve `NA` values, so returned row indices match the row indices of #' the original data frame. #' @param ... A sequence of logical expressions indicating which rows to keep, #' or a numeric vector indicating the row indices of rows to keep. Can also be #' a string representation of a logical expression (e.g. `"x > 4"`), a #' character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains #' the string representation of a logical expression. These might be useful #' when used in packages to avoid defining undefined global variables. #' #' @return A filtered data frame, or the row indices that match the specified #' configuration. #' #' @details For `data_match()`, if `match` is either `"or"` or `"not"`, the #' original row order from `x` might be changed. If preserving row order is #' required, use `data_filter()` instead. #' #' ``` #' # mimics subset() behaviour, preserving original row order #' head(data_filter(mtcars[c("mpg", "vs", "am")], vs == 0 | am == 1)) #' #> mpg vs am #' #> Mazda RX4 21.0 0 1 #' #> Mazda RX4 Wag 21.0 0 1 #' #> Datsun 710 22.8 1 1 #' #> Hornet Sportabout 18.7 0 0 #' #> Duster 360 14.3 0 0 #' #> Merc 450SE 16.4 0 0 #' #' # re-sorting rows #' head(data_match(mtcars[c("mpg", "vs", "am")], #' data.frame(vs = 0, am = 1), #' match = "or")) #' #> mpg vs am #' #> Mazda RX4 21.0 0 1 #' #> Mazda RX4 Wag 21.0 0 1 #' #> Hornet Sportabout 18.7 0 0 #' #> Duster 360 14.3 0 0 #' #> Merc 450SE 16.4 0 0 #' #> Merc 450SL 17.3 0 0 #' ``` #' #' While `data_match()` works with data frames to match conditions against, #' `data_filter()` is basically a wrapper around `subset(subset = )`. #' However, unlike `subset()`, it preserves label attributes and is useful when #' working with labelled data. #' #' @examples #' data_match(mtcars, data.frame(vs = 0, am = 1)) #' data_match(mtcars, data.frame(vs = 0, am = c(0, 1))) #' #' # observations where "vs" is NOT 0 AND "am" is NOT 1 #' data_match(mtcars, data.frame(vs = 0, am = 1), match = "not") #' # equivalent to #' data_filter(mtcars, vs != 0 & am != 1) #' #' # observations where EITHER "vs" is 0 OR "am" is 1 #' data_match(mtcars, data.frame(vs = 0, am = 1), match = "or") #' # equivalent to #' data_filter(mtcars, vs == 0 | am == 1) #' #' # slice data frame by row indices #' data_filter(mtcars, 5:10) #' #' # Define a custom function containing data_filter() #' my_filter <- function(data, variable) { #' data_filter(data, variable) #' } #' my_filter(mtcars, "cyl == 6") #' #' # Pass complete filter-condition as string. #' my_filter <- function(data, condition) { #' data_filter(data, condition) #' } #' my_filter(mtcars, "am != 0") #' #' # string can also be used directly as argument #' data_filter(mtcars, "am != 0") #' #' # or as variable #' fl <- "am != 0" #' data_filter(mtcars, fl) #' @inherit data_rename seealso #' @export data_match <- function( x, to, match = "and", return_indices = FALSE, remove_na = TRUE, ... ) { if (!is.data.frame(to)) { to <- as.data.frame(to) } original_x <- x # evaluate match <- match.arg( tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not") ) match <- switch( match, `&` = , `&&` = , and = "and", `!` = , not = "not", "or" ) # validation check shared_columns <- intersect(colnames(x), colnames(to)) if (is.null(shared_columns) || length(shared_columns) == 0L) { insight::format_error( "None of the columns from the data frame with matching conditions were found in `x`." ) } # only select common columns x <- x[shared_columns] # prepare if (identical(match, "or")) { idx <- vector("numeric", length = 0L) } else { # remove missings before matching if (isTRUE(remove_na)) { x <- x[stats::complete.cases(x), , drop = FALSE] } idx <- seq_len(nrow(x)) } # Find matching rows for (col in names(to)) { values <- x[[col]] if (match == "or") { idx <- union(idx, which(values %in% to[[col]])) } else if (match == "not") { idx <- idx[!values[idx] %in% to[[col]]] } else { idx <- idx[values[idx] %in% to[[col]]] } } # prepare output if (isFALSE(return_indices)) { out <- original_x[idx, , drop = FALSE] # restore value and variable labels for (i in colnames(out)) { attr(out[[i]], "label") <- attr(original_x[[i]], "label", exact = TRUE) attr(out[[i]], "labels") <- attr(original_x[[i]], "labels", exact = TRUE) } } else { out <- idx } # add back custom attributes out <- .replace_attrs(out, attributes(original_x)) out } #' @rdname data_match #' @export data_filter <- function(x, ...) { UseMethod("data_filter") } #' @export data_filter.data.frame <- function(x, ...) { out <- x # convert tibble to data.frame if (inherits(x, "tbl_df")) { out <- as.data.frame(out, stringsAsFactors = FALSE) tbl_input <- TRUE } else { tbl_input <- FALSE } dots <- match.call(expand.dots = FALSE)[["..."]] if (any(nzchar(names(dots), keepNA = TRUE))) { insight::format_error( "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison." ) } # turn character vector (like `c("mpg <= 20", "cyl == 6")`) into symbols if (length(dots) == 1) { character_vector <- .dynEval(dots[[1]], ifnotfound = NULL) if (is.character(character_vector) && length(character_vector) > 1) { dots <- lapply(character_vector, str2lang) } } # Check syntax of the filter. Must be done *before* calling subset() # (cf easystats/datawizard#237) for (.fcondition in dots) { .check_filter_syntax(insight::safe_deparse(.fcondition)) } for (i in seq_along(dots)) { # only proceed when result is still valid if (!is.null(out)) { symbol <- dots[[i]] # evaluate, we may have a variable with filter expression eval_symbol <- .dynEval(symbol, ifnotfound = NULL) # validation check: is variable named like a function? if (is.function(eval_symbol)) { eval_symbol <- .dynGet(symbol, ifnotfound = NULL) } eval_symbol_numeric <- NULL if (!is.null(eval_symbol)) { # when possible to evaluate, do we have a numeric vector provided # as string? (e.g. `"5:10"`) - then try to coerce to numeric eval_symbol_numeric <- tryCatch( eval(parse(text = eval_symbol)), error = function(e) NULL ) } # here we go when we have a filter expression, and no numeric vector to slice if ( is.null(eval_symbol) || (!is.numeric(eval_symbol) && !is.numeric(eval_symbol_numeric)) ) { # could be evaluated? Then filter expression is a string and we need # to convert into symbol if (is.character(eval_symbol)) { symbol <- str2lang(eval_symbol) } # filter data out <- tryCatch( subset(out, subset = eval(symbol, envir = new.env())), warning = function(e) e, error = function(e) e ) } else if (is.numeric(eval_symbol)) { # if symbol could be evaluated and is numeric, slice out <- tryCatch(out[eval_symbol, , drop = FALSE], error = function(e) { NULL }) } else if (is.numeric(eval_symbol_numeric)) { # if symbol could be evaluated, was string and could be converted to numeric, slice out <- tryCatch( out[eval_symbol_numeric, , drop = FALSE], error = function(e) NULL ) } if (inherits(out, c("simpleError", "objectNotFoundError"))) { error_msg <- out$message[1] # try to find out which variable was the cause for the error if (grepl("object '(.*)' not found", error_msg)) { error_var <- gsub("object '(.*)' not found", "\\1", error_msg) # some syntax errors do not relate to misspelled variables... if (!error_var %in% colnames(x)) { insight::format_error( paste0( "Variable \"", error_var, "\" was not found in the dataset." ), .misspelled_string(colnames(x), error_var, "Possibly misspelled?") ) } } out <- NULL } } } if (is.null(out)) { insight::format_error( "Filtering did not work. Please check the syntax of your conditions." ) } # restore value and variable labels for (i in colnames(out)) { attr(out[[i]], "label") <- attr(x[[i]], "label", exact = TRUE) attr(out[[i]], "labels") <- attr(x[[i]], "labels", exact = TRUE) } # add back custom attributes out <- .replace_attrs(out, attributes(x)) # add back tidyverse attributes if (isTRUE(tbl_input)) { class(out) <- c("tbl_df", "tbl", "data.frame") } out } #' @export data_filter.grouped_df <- function(x, ...) { original_x <- x grps <- attr(x, "groups", exact = TRUE) grps <- grps[[".rows"]] # Remove tidyverse attributes, will add them back at the end if (inherits(x, "tbl_df")) { tbl_input <- TRUE x <- as.data.frame(x, stringsAsFactors = FALSE) } else { tbl_input <- FALSE } dots <- match.call(expand.dots = FALSE)[["..."]] out <- lapply(grps, function(grp) { arguments <- list(x[grp, ]) arguments <- c(arguments, dots) do.call("data_filter.data.frame", arguments) }) out <- do.call(rbind, out) if (!insight::object_has_rownames(x)) { rownames(out) <- NULL } # add back tidyverse attributes if (isTRUE(tbl_input)) { class(out) <- c("tbl_df", "tbl", "data.frame") } # add back custom attributes out <- .replace_attrs(out, attributes(original_x)) out } # helper ------------------- .check_filter_syntax <- function(.fcondition) { # NOTE: We cannot check for `=` when "filter" is not a character vector # because the function will then fail in general. I.e., # "data_filter(mtcars, filter = mpg > 10 & cyl = 4)" will not start # running this function and never reaches the first code line, # but immediately stops... tmp <- gsub("==", "", .fcondition, fixed = TRUE) tmp <- gsub("<=", "", tmp, fixed = TRUE) tmp <- gsub(">=", "", tmp, fixed = TRUE) tmp <- gsub("!=", "", tmp, fixed = TRUE) # We want to check whether user used a "=" in the filter syntax. This # typically indicates that the comparison "==" is probably wrong by using # a "=" instead of `"=="`. However, if a function was provided, we indeed # may have "=", e.g. if the pattern was # `data_filter(out, grep("pattern", x = value))`. We thus first check if we # can identify a function call, and only continue checking for wrong syntax # when we have not identified a function. if ( !is.function(tryCatch( get(gsub("^(.*?)\\((.*)", "\\1", tmp)), error = function(e) NULL )) ) { # Give more informative message to users # about possible misspelled comparisons / logical conditions # check if "=" instead of "==" was used? if (any(grepl("=", tmp, fixed = TRUE))) { insight::format_error( "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison." ) } # check if "&&" etc instead of "&" was used? logical_operator <- NULL if (any(grepl("&&", .fcondition, fixed = TRUE))) { logical_operator <- "&&" } if (any(grepl("||", .fcondition, fixed = TRUE))) { logical_operator <- "||" } if (!is.null(logical_operator)) { insight::format_error( paste0( "Filtering did not work. Please check if you need `", substr(logical_operator, 0, 1), "` (instead of `", logical_operator, "`) as logical operator." ) ) } } } ================================================ FILE: R/data_merge.R ================================================ #' @title Merge (join) two data frames, or a list of data frames #' @name data_merge #' #' @description #' Merge (join) two data frames, or a list of data frames. However, unlike #' base R's `merge()`, `data_merge()` offers a few more methods to join data #' frames, and it does not drop data frame nor column attributes. #' #' @param x,y A data frame to merge. `x` may also be a list of data frames #' that will be merged. Note that the list-method has no `y` argument. #' @param join Character vector, indicating the method of joining the data frames. #' Can be `"full"`, `"left"` (default), `"right"`, `"inner"`, `"anti"`, `"semi"` #' or `"bind"`. See details below. #' @param by Specifications of the columns used for merging. #' @param id Optional name for ID column that will be created to indicate the #' source data frames for appended rows. Only applies if `join = "bind"`. #' @param verbose Toggle warnings. #' @param ... Not used. #' #' @return #' A merged data frame. #' #' @section Merging data frames: #' #' Merging data frames is performed by adding rows (cases), columns #' (variables) or both from the source data frame (`y`) to the target #' data frame (`x`). This usually requires one or more variables which #' are included in both data frames and that are used for merging, typically #' indicated with the `by` argument. When `by` contains a variable present #' in both data frames, cases are matched and filtered by identical values #' of `by` in `x` and `y`. #' #' @section Left- and right-joins: #' #' Left- and right joins usually don't add new rows (cases), but only new #' columns (variables) for existing cases in `x`. For `join = "left"` or #' `join = "right"` to work, `by` *must* indicate one or more columns that #' are included in both data frames. For `join = "left"`, if `by` is an #' identifier variable, which is included in both `x` and `y`, all variables #' from `y` are copied to `x`, but only those cases from `y` that have #' matching values in their identifier variable in `x` (i.e. all cases #' in `x` that are also found in `y` get the related values from the new #' columns in `y`). If there is no match between identifiers in `x` and `y`, #' the copied variable from `y` will get a `NA` value for this particular #' case. Other variables that occur both in `x` and `y`, but are not used #' as identifiers (with `by`), will be renamed to avoid multiple identical #' variable names. Cases in `y` where values from the identifier have no #' match in `x`'s identifier are removed. `join = "right"` works in #' a similar way as `join = "left"`, just that only cases from `x` that #' have matching values in their identifier variable in `y` are chosen. #' #' In base R, these are equivalent to `merge(x, y, all.x = TRUE)` and #' `merge(x, y, all.y = TRUE)`. #' #' @section Full joins: #' #' Full joins copy all cases from `y` to `x`. For matching cases in both #' data frames, values for new variables are copied from `y` to `x`. For #' cases in `y` not present in `x`, these will be added as new rows to `x`. #' Thus, full joins not only add new columns (variables), but also might #' add new rows (cases). #' #' In base R, this is equivalent to `merge(x, y, all = TRUE)`. #' #' @section Inner joins: #' #' Inner joins merge two data frames, however, only those rows (cases) are #' kept that are present in both data frames. Thus, inner joins usually #' add new columns (variables), but also remove rows (cases) that only #' occur in one data frame. #' #' In base R, this is equivalent to `merge(x, y)`. #' #' @section Binds: #' #' `join = "bind"` row-binds the complete second data frame `y` to `x`. #' Unlike simple `rbind()`, which requires the same columns for both data #' frames, `join = "bind"` will bind shared columns from `y` to `x`, and #' add new columns from `y` to `x`. #' #' @examples #' #' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3) #' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4) #' #' x #' y #' #' # "by" will default to all shared columns, i.e. "c" and "id". new columns #' # "d" and "e" will be copied from "y" to "x", but there are only two cases #' # in "x" that have the same values for "c" and "id" in "y". only those cases #' # have values in the copied columns, the other case gets "NA". #' data_merge(x, y, join = "left") #' #' # we change the id-value here #' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3) #' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 3:5) #' #' x #' y #' #' # no cases in "y" have the same matching "c" and "id" as in "x", thus #' # copied variables from "y" to "x" copy no values, all get NA. #' data_merge(x, y, join = "left") #' #' # one case in "y" has a match in "id" with "x", thus values for this #' # case from the remaining variables in "y" are copied to "x", all other #' # values (cases) in those remaining variables get NA #' data_merge(x, y, join = "left", by = "id") #' #' data(mtcars) #' x <- mtcars[1:5, 1:3] #' y <- mtcars[28:32, 4:6] #' #' # add ID common column #' x$id <- 1:5 #' y$id <- 3:7 #' #' # left-join, add new variables and copy values from y to x, #' # where "id" values match #' data_merge(x, y) #' #' # right-join, add new variables and copy values from x to y, #' # where "id" values match #' data_merge(x, y, join = "right") #' #' # full-join #' data_merge(x, y, join = "full") #' #' #' data(mtcars) #' x <- mtcars[1:5, 1:3] #' y <- mtcars[28:32, c(1, 4:5)] #' #' # add ID common column #' x$id <- 1:5 #' y$id <- 3:7 #' #' # left-join, no matching rows (because columns "id" and "disp" are used) #' # new variables get all NA values #' data_merge(x, y) #' #' # one common value in "mpg", so one row from y is copied to x #' data_merge(x, y, by = "mpg") #' #' # only keep rows with matching values in by-column #' data_merge(x, y, join = "semi", by = "mpg") #' #' # only keep rows with non-matching values in by-column #' data_merge(x, y, join = "anti", by = "mpg") #' #' # merge list of data frames. can be of different rows #' x <- mtcars[1:5, 1:3] #' y <- mtcars[28:31, 3:5] #' z <- mtcars[11:18, c(1, 3:4, 6:8)] #' x$id <- 1:5 #' y$id <- 4:7 #' z$id <- 3:10 #' data_merge(list(x, y, z), join = "bind", by = "id", id = "source") #' @inherit data_rename seealso #' @export data_merge <- function(x, ...) { UseMethod("data_merge") } #' @rdname data_merge #' @export data_join <- data_merge #' @rdname data_merge #' @export data_merge.data.frame <- function( x, y, join = "left", by = NULL, id = NULL, verbose = TRUE, ... ) { class_x <- class(x) # save variable attributes attr_x_vars <- lapply(x, attributes) attr_y_vars <- lapply(y, attributes) attr_vars <- c( attr_x_vars, attr_y_vars[names(attr_y_vars)[!names(attr_y_vars) %in% names(attr_x_vars)]] ) # check join-argument ---------------------- join <- match.arg( join, choices = c("full", "left", "right", "inner", "semi", "anti", "bind") ) # check id-argument ---------------------- all_columns <- union(colnames(x), colnames(y)) if (join == "bind" && !is.null(id) && id %in% all_columns) { # ensure unique ID id <- make.unique(c(all_columns, id), sep = "_")[length(all_columns) + 1] # and also tell user... if (isTRUE(verbose)) { insight::format_warning( sprintf( "Value of `id` already exists as column name. ID column was renamed to `%s`.", id ) ) } } if (!is.null(id) && join == "bind") { x[[id]] <- 1 y[[id]] <- 2 } # check merge columns ("by"-argument) ---------------------- if (join != "bind") { # we need a value for "by". If not provided, use all shared column names if (is.null(by)) { by <- intersect(colnames(x), colnames(y)) } # If not all column names specified in "by" are present, yield warning # and use all shared column names if (!all(by %in% colnames(x)) || !all(by %in% colnames(y))) { missing_in_x <- setdiff(by, colnames(x)) missing_in_y <- setdiff(by, colnames(y)) stop_message <- c( "Not all columns specified in `by` were found in the data frames.", if (length(missing_in_x) > 0L) { paste0( "Following columns are in `by` but absent in `x`: ", text_concatenate(missing_in_x) ) }, if (length(missing_in_y) > 0L) { paste0( "Following columns are in `by` but absent in `y`: ", text_concatenate(missing_in_y) ) } ) if (isTRUE(verbose)) { insight::format_error(stop_message) } } # if still both data frames have no common columns, do a full join if (!length(by)) { if (isTRUE(verbose)) { insight::format_warning( "Found no matching columns in the data frames. Fully merging both data frames now.", "Note that this can lead to unintended results, because rows in `x` and `y` are possibly duplicated.", "You probably want to use `data_merge(x, y, join = \"bind\")` instead." ) } by <- NULL join <- "full" } } # check valid combination of "join" and "by" ----------------------- if (join %in% c("anti", "semi") && (is.null(by) || length(by) != 1)) { insight::format_error( sprintf( "For `join = \"%s\"`, `by` needs to be a name of only one variable that is present in both data frames.", join ) ) } # merge -------------------- # for later sorting if (join != "bind") { if (nrow(x) > 0L) { x$.data_merge_id_x <- seq_len(nrow(x)) } if (nrow(y) > 0L) { y$.data_merge_id_y <- (seq_len(nrow(y))) + nrow(x) } } all_columns <- union(colnames(x), colnames(y)) out <- switch( join, full = merge(x, y, all = TRUE, sort = FALSE, by = by), left = merge(x, y, all.x = TRUE, sort = FALSE, by = by), right = merge(x, y, all.y = TRUE, sort = FALSE, by = by), inner = merge(x, y, sort = FALSE, by = by), semi = x[x[[by]] %in% y[[by]], , drop = FALSE], anti = x[!x[[by]] %in% y[[by]], , drop = FALSE], bind = .bind_data_frames(x, y) ) # sort rows, add attributes, and return results ------------------------- if (".data_merge_id_x" %in% colnames(out)) { # for full joins, we have no complete sorting id, but NAs for each # data frame. we now "merge" the two sorting IDs from each data frame. if (anyNA(out$.data_merge_id_x) && ".data_merge_id_y" %in% colnames(out)) { out$.data_merge_id_x[is.na( out$.data_merge_id_x )] <- out$.data_merge_id_y[is.na(out$.data_merge_id_x)] } out <- out[order(out$.data_merge_id_x), ] out$.data_merge_id_x <- NULL out$.data_merge_id_y <- NULL } # try to restore original column order as good as possible. Therefore, we # first take all column names of the original input data frames, then # we add all new columns, like duplicated from merging (name.x and name.y, # if "name" was in both data frames, but not used in "by"), and then do a # final check that all column names are present in "out" (e.g., "name" would) # no longer be there if we have "name.x" and "name.y"). all_columns <- c(all_columns, setdiff(colnames(out), all_columns)) all_columns <- all_columns[all_columns %in% colnames(out)] out <- out[all_columns] # add back attributes out <- .replace_attrs(out, attributes(y)) out <- .replace_attrs(out, attributes(x)) for (i in colnames(out)) { if (is.list(attr_vars[[i]])) { if (is.list(attributes(out[[i]]))) { attributes(out[[i]]) <- utils::modifyList( attr_vars[[i]], attributes(out[[i]]) ) } else { attributes(out[[i]]) <- attr_vars[[i]] } } } class(out) <- unique(c(class_x, "data.frame")) out } #' @rdname data_merge #' @export data_merge.list <- function( x, join = "left", by = NULL, id = NULL, verbose = TRUE, ... ) { out <- x[[1]] df_id <- rep(1, times = nrow(out)) for (i in 2:length(x)) { out <- data_merge( out, x[[i]], join = join, by = by, id = NULL, verbose = verbose, ... ) df_id <- c(df_id, rep(i, times = nrow(x[[i]]))) } # we need separate handling for list of data frames and id-variable here if (!is.null(id) && join == "bind") { if (id %in% colnames(out)) { # ensure unique ID id <- make.unique(c(colnames(out), id), sep = "_")[ length(colnames(out)) + 1 ] # and also tell user... if (isTRUE(verbose)) { insight::format_warning( sprintf( "Value of `id` already exists as column name. ID column was renamed to `%s`.", id ) ) } } out[[id]] <- df_id } out } .bind_data_frames <- function(x, y) { # merge and sort. "rbind()" is faster than "merge()" if all columns present if (all(colnames(x) %in% colnames(y)) && ncol(x) == ncol(y)) { # we may have different column order out <- rbind(x, y[match(colnames(x), colnames(y))]) } else { # add ID for merging if (nrow(x) > 0L) { x$.data_merge_row <- seq_len(nrow(x)) } if (nrow(y) > 0L) { y$.data_merge_row <- (nrow(x) + 1):(nrow(x) + nrow(y)) } merge_by <- intersect(colnames(x), colnames(y)) out <- merge(x, y, all = TRUE, sort = FALSE, by = merge_by) } # for empty df's, merge() may return an empty character vector # make sure it's a data frame object. if (!is.data.frame(out)) { out <- as.data.frame(out) } if (".data_merge_row" %in% colnames(out)) { out <- out[order(out$.data_merge_row), ] } out$.data_merge_row <- NULL out } ================================================ FILE: R/data_modify.R ================================================ #' Create new variables in a data frame #' #' Create new variables or modify existing variables in a data frame. Unlike `base::transform()`, `data_modify()` #' can be used on grouped data frames, and newly created variables can be directly #' used. #' #' @param data A data frame #' @param ... One or more expressions that define the new variable name and the #' values or recoding of those new variables. These expressions can be one of: #' - A sequence of named, literal expressions, where the left-hand side refers #' to the name of the new variable, while the right-hand side represent the #' values of the new variable. Example: `Sepal.Width = center(Sepal.Width)`. #' - A vector of length 1 (which will be recycled to match the number of rows #' in the data), or of same length as the data. #' - A variable that contains a value to be used. Example: #' ```r #' a <- "abc" #' data_modify(iris, var_abc = a) # var_abc contains "abc" #' ``` #' - An expression can also be provided as string and wrapped in #' `as_expr()`. Example: #' ```r #' data_modify(iris, as_expr("Sepal.Width = center(Sepal.Width)")) #' # or #' a <- "center(Sepal.Width)" #' data_modify(iris, Sepal.Width = as_expr(a)) #' # or #' a <- "Sepal.Width = center(Sepal.Width)" #' data_modify(iris, as_expr(a)) #' ``` #' Note that `as_expr()` is no real function, which cannot be used outside #' of `data_modify()`, and hence it is not exported nor documented. Rather, #' it is only used for internally processing expressions. #' - Using `NULL` as right-hand side removes a variable from the data frame. #' Example: `Petal.Width = NULL`. #' - For data frames (including grouped ones), the function `n()` can be used to #' count the number of observations and thereby, for instance, create index #' values by using `id = 1:n()` or `id = 3:(n()+2)` and similar. Note that, #' like `as_expr()`, `n()` is also no true function and cannot be used outside #' of `data_modify()`. #' #' Note that newly created variables can be used in subsequent expressions, #' including `.at` or `.if`. See also 'Examples'. #' #' @param .at A character vector of variable names that should be modified. This #' argument is used in combination with the `.modify` argument. Note that only one #' of `.at` or `.if` can be provided, but not both at the same time. Newly created #' variables in `...` can also be selected, see 'Examples'. #' @param .if A function that returns `TRUE` for columns in the data frame where #' `.if` applies. This argument is used in combination with the `.modify` argument. #' Note that only one of `.at` or `.if` can be provided, but not both at the same #' time. Newly created variables in `...` can also be selected, see 'Examples'. #' @param .modify A function that modifies the variables defined in `.at` or `.if`. #' This argument is used in combination with either the `.at` or the `.if` argument. #' Note that the modified variable (i.e. the result from `.modify`) must be either #' of length 1 or of same length as the input variable. #' #' @note `data_modify()` can also be used inside functions. However, it is #' recommended to pass the recode-expression as character vector or list of #' characters. #' #' @examples #' data(efc) #' new_efc <- data_modify( #' efc, #' c12hour_c = center(c12hour), #' c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), #' c12hour_z2 = standardize(c12hour) #' ) #' head(new_efc) #' #' # using strings instead of literal expressions #' new_efc <- data_modify( #' efc, #' as_expr("c12hour_c = center(c12hour)"), #' as_expr("c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"), #' as_expr("c12hour_z2 = standardize(c12hour)") #' ) #' head(new_efc) #' #' # using a character vector, provided a variable #' xpr <- c( #' "c12hour_c = center(c12hour)", #' "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)", #' "c12hour_z2 = standardize(c12hour)" #' ) #' new_efc <- data_modify(efc, as_expr(xpr)) #' head(new_efc) #' #' # using character strings, provided as variable #' stand <- "c12hour_c / sd(c12hour, na.rm = TRUE)" #' new_efc <- data_modify( #' efc, #' c12hour_c = center(c12hour), #' c12hour_z = as_expr(stand) #' ) #' head(new_efc) #' #' # attributes - in this case, value and variable labels - are preserved #' str(new_efc) #' #' # using `paste()` to build a string-expression #' to_standardize <- c("Petal.Length", "Sepal.Length") #' out <- data_modify( #' iris, #' as_expr( #' paste0(to_standardize, "_stand = standardize(", to_standardize, ")") #' ) #' ) #' head(out) #' #' # overwrite existing variable, remove old variable #' out <- data_modify(iris, Petal.Length = 1 / Sepal.Length, Sepal.Length = NULL) #' head(out) #' #' # works on grouped data #' grouped_efc <- data_group(efc, "c172code") #' new_efc <- data_modify( #' grouped_efc, #' c12hour_c = center(c12hour), #' c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), #' c12hour_z2 = standardize(c12hour), #' id = 1:n() #' ) #' head(new_efc) #' #' # works from inside functions #' foo1 <- function(data, ...) { #' head(data_modify(data, ...)) #' } #' foo1(iris, SW_fraction = Sepal.Width / 10) #' # or #' foo1(iris, as_expr("SW_fraction = Sepal.Width / 10")) #' #' # also with string arguments, using `as_expr()` #' foo2 <- function(data, modification) { #' head(data_modify(data, as_expr(modification))) #' } #' foo2(iris, "SW_fraction = Sepal.Width / 10") #' #' # modify at specific positions or if condition is met #' d <- iris[1:5, ] #' data_modify(d, .at = "Species", .modify = as.numeric) #' data_modify(d, .if = is.factor, .modify = as.numeric) #' #' # can be combined with dots #' data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric) #' #' # new variables used in `.at` or `.if` #' data_modify( #' d, #' new_length = Petal.Length * 2, #' .at = c("Petal.Length", "new_length"), #' .modify = round #' ) #' #' # combine "extract_column_names()" and ".at" argument #' out <- data_modify( #' d, #' .at = extract_column_names(d, select = starts_with("Sepal")), #' .modify = as.factor #' ) #' # "Sepal.Length" and "Sepal.Width" are now factors #' str(out) #' #' @export data_modify <- function(data, ...) { UseMethod("data_modify") } #' @export data_modify.default <- function(data, ...) { insight::format_error("`data` must be a data frame.") } #' @rdname data_modify #' @export data_modify.data.frame <- function( data, ..., .if = NULL, .at = NULL, .modify = NULL ) { dots <- eval(substitute(alist(...))) # error for data frames with no rows... if (nrow(data) == 0) { insight::format_error( "`data` is an empty data frame. `data_modify()` only works for data frames with at least one row." ) # nolint } # check if we have dots, or only at/modify ---- if (length(dots)) { # Check if dots are named. Usually, all dots should be named, i.e. include # the name of the new variable. There's only one exception, if a string is # masked as expression, and this string includes the new name, e.g. # # data_modify(iris, as_expr("sepwid = 2 * Sepal.Width")) # a <- "sepwid = 2 * Sepal.Width" # data_modify(iris, as_expr(a)) # dots <- .process_unnamed_expressions(dots, data) # next, we check for named expression-tags and convert these into regular # expressions, e.g. # # data_modify(iris, sepwid = = as_expr("2 * Sepal.Width")) # a <- "2 * Sepal.Width" # data_modify(iris, sepwid = as_expr(a)) # dots <- .process_named_expressions(dots, data) for (i in seq_along(dots)) { # create new variable new_variable <- .get_new_dots_variable(dots, i, data) # give informative error when new variable doesn't match number of rows if ( !is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0 ) { insight::format_error( "New variable has not the same length as the other variables in the data frame and cannot be recycled." ) } data[[names(dots)[i]]] <- new_variable } } # check if we have at/modify ---- data <- .modify_at(data, .at, .if, .modify) data } #' @export data_modify.grouped_df <- function( data, ..., .if = NULL, .at = NULL, .modify = NULL ) { # we need to evaluate dots here, and pass them with "do.call" to # the data.frame method later... dots <- match.call(expand.dots = FALSE)[["..."]] # error for data frames with no rows... if (nrow(data) == 0) { insight::format_error( "`data` is an empty data frame. `data_modify()` only works for data frames with at least one row." ) # nolint } grps <- attr(data, "groups", exact = TRUE) grps <- grps[[".rows"]] attr_data <- attributes(data) # remove conflicting class attributes class_attr <- class(data) data <- as.data.frame(data) if (length(dots)) { # check is dots are named. Usually, all dots should be named, i.e. include # the name of the new variable. There's only one exception, if a string is # masked as expression, and this string includes the new name, e.g. # # data_modify(iris, as_expr("sepwid = 2 * Sepal.Width")) # a <- "sepwid = 2 * Sepal.Width" # data_modify(iris, as_expr(a)) # dots <- .process_unnamed_expressions(dots, data) # next, we check for named expression-tags and convert these into regular # expressions, e.g. # # data_modify(iris, sepwid = = as_expr("2 * Sepal.Width")) # a <- "2 * Sepal.Width" # data_modify(iris, sepwid = as_expr(a)) # dots <- .process_named_expressions(dots, data) } # create new variables as dummys, do for-loop works for (i in names(dots)) { # don't overwrite / fill existing variables with NA, # e.g. if we have "data_modify(iris, Sepal.Length = normalize(Sepal.Length))" # normalize() won't work when we fill with NA if (!i %in% colnames(data)) { data[[i]] <- NA } } # create new variables per group for (rows in grps) { data[rows, ] <- data_modify.data.frame(data[rows, ], ...) } # check if we have at/modify ---- data <- .modify_at(data, .at, .if, .modify) # set back attributes and class data <- .replace_attrs(data, attr_data) class(data) <- class_attr data } # expression processing ---------------------------------------------------- .process_unnamed_expressions <- function(dots, data) { # dots are only unnamed, when the full expression is saved in a string, # e.g. data_modify(iris, as_expr("sepwid = 2 * Sepal.Width")). # Thus, we know we *have to* find an expression here, and the string value # *must* contain a name definition. If not, fail. If yes, convert string # into a language expression... if (!is.null(names(dots)) && all(nzchar(names(dots)))) { # if all elements are named, return early return(dots) } # find which dots are unnamed, check those for expressions if (is.null(names(dots))) { unnamed_dots <- seq_along(dots) } else { unnamed_dots <- which(!nzchar(names(dots))) } for (i in rev(unnamed_dots)) { # copy dot-element and convert to string for manipulation dot_element <- dots[[i]] symbol_string <- insight::safe_deparse(dot_element) # sanity check - this may happen when user wants to remove a variable, # e.g. data_modify(iris, as_expr("Species = NULL")) if (is.null(symbol_string)) { next } # we only allow unnamed elements if these are masked as expression. String # values or numeric values require a named element, i.e. we can only have # data_modify(iris, newvar = "a"), but we cannot have data_modify(iris, "a"). # For expression, missing name is possible. if (!startsWith(symbol_string, "as_expr")) { insight::format_error(paste0( "A variable name for the expression `", symbol_string, "` is missing. ", "Please use something like `new_name = ", symbol_string, "`." )) } # next, check if the string-expression includes a name for the new variable # therefore, we remove the "as_expr()" token if (startsWith(symbol_string, "as_expr")) { symbol_string <- insight::trim_ws( gsub("as_expr\\((.*)\\)", "\\1", symbol_string) ) } # remove c(), split at comma, if we have a vector of expressions if (startsWith(symbol_string, "c(")) { symbol_string <- gsub("c\\((.*)\\)", "\\1", symbol_string) # only split at highest-level comma pattern <- ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)" # suggestion by Co-pilot # Locate commas not inside quotes symbol_string <- insight::trim_ws(unlist( strsplit(symbol_string, pattern, perl = TRUE), use.names = FALSE )) } # check if we have any symbols instead of strings as expression, e.g. # xpr <- "sepwid = 2 * Sepal.Width" # data_modify(iris, as_expr(xpr)) # # in this case, we need to evaluate the symbol (i.e. convert symbol string # into a language expression and then evaluate) symbol_string <- .evaluate_expression_in_string(symbol_string, data) # check whether we have exact one = sign. We need to have a name definition, # i.e. something like "var = a+b" - if the string has no "=" sign, name is # definitely missing pattern <- "(? 1, "s ", " "), text_concatenate(not_found, enclose = "\""), ifelse(length(not_found) > 1, " were", " was"), " not found in the dataset." ), .misspelled_string( column_names, not_found, "Possibly misspelled or not yet defined?" ) ) } for (i in .at) { result <- tryCatch( .modify(data[[i]]), warning = function(e) e, error = function(e) e ) if (inherits(result, c("error", "warning"))) { insight::format_error( paste0("Error in modifying variable \"", i, "\": ", result$message), "Please check if you correctly specified the `.modify` function." ) } else { data[[i]] <- result } } data } .get_new_dots_variable <- function(dots, i, data) { # iterate expressions for new variables symbol <- dots[[i]] # we evaluate the content of "symbol", hence, "eval_symbol" either contains # the values of the expression, or the expression itself as string eval_symbol <- .dynEval(symbol, ifnotfound = NULL, data = data) # finally, we can evaluate expression and get values for new variables symbol_string <- insight::safe_deparse(symbol) if (!is.null(symbol_string) && all(symbol_string == "n()")) { # "special" functions - using "n()" just returns number of rows new_variable <- nrow(data) } else if ( !is.null(symbol_string) && length(symbol_string) == 1 && grepl("\\bn\\(\\)", symbol_string) ) { # "special" functions, like "1:n()" or similar - but not "1:fun()" symbol_string <- str2lang(gsub( "n()", "nrow(data)", symbol_string, fixed = TRUE )) new_variable <- try(with(data, eval(symbol_string)), silent = TRUE) } else { # evaluate symbol new_variable <- try(with(data, eval(symbol)), silent = TRUE) # if evaluation fails, we have a value - and directly use it if (inherits(new_variable, "try-error") && !is.null(eval_symbol)) { new_variable <- eval_symbol } } # successful, or any errors, like misspelled variable name? if (inherits(new_variable, "try-error")) { # in which step did error happen? step_number <- switch( as.character(i), "1" = "the first expression", "2" = "the second expression", "3" = "the third expression", paste("expression", i) ) step_msg <- paste0("There was an error in ", step_number, ".") # try to find out which variable was the cause for the error error_msg <- attributes(new_variable)$condition$message if (grepl("object '(.*)' not found", error_msg)) { error_var <- gsub("object '(.*)' not found", "\\1", error_msg) insight::format_error( paste0( step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment." ), .misspelled_string( colnames(data), error_var, "Possibly misspelled or not yet defined?" ) ) } else { insight::format_error(paste0( step_msg, " ", insight::format_capitalize(error_msg), ". Possibly misspelled or not yet defined?" )) } } new_variable } ================================================ FILE: R/data_partition.R ================================================ #' Partition data #' #' Creates data partitions (for instance, a training and a test set) based on a #' data frame that can also be stratified (i.e., evenly spread a given factor) #' using the `by` argument. #' #' @inheritParams data_rename #' @param proportion Scalar (between 0 and 1) or numeric vector, indicating the #' proportion(s) of the training set(s). The sum of `proportion` must not be #' greater than 1. The remaining part will be used for the test set. #' @param by A character vector indicating the name(s) of the column(s) used #' for stratified partitioning. #' @param seed A random number generator seed. Enter an integer (e.g. 123) so #' that the random sampling will be the same each time you run the function. #' @param row_id Character string, indicating the name of the column that #' contains the row-id's. #' @param verbose Toggle messages and warnings. #' #' @return A list of data frames. The list includes one training set per given #' proportion and the remaining data as test set. List elements of training #' sets are named after the given proportions (e.g., `$p_0.7`), the test set #' is named `$test`. #' #' @examples #' data(iris) #' out <- data_partition(iris, proportion = 0.9) #' out$test #' nrow(out$p_0.9) #' #' # Stratify by group (equal proportions of each species) #' out <- data_partition(iris, proportion = 0.9, by = "Species") #' out$test #' #' # Create multiple partitions #' out <- data_partition(iris, proportion = c(0.3, 0.3)) #' lapply(out, head) #' #' # Create multiple partitions, stratified by group - 30% equally sampled #' # from species in first training set, 50% in second training set and #' # remaining 20% equally sampled from each species in test set. #' out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species") #' lapply(out, function(i) table(i$Species)) #' #' @inherit data_rename seealso #' @export data_partition <- function( data, proportion = 0.7, by = NULL, seed = NULL, row_id = ".row_id", verbose = TRUE, ... ) { # validation checks data <- .coerce_to_dataframe(data) if (sum(proportion) > 1) { insight::format_error("Sum of `proportion` cannot be higher than 1.") } if (any(proportion < 0)) { insight::format_error("Values in `proportion` cannot be negative.") } if (sum(proportion) == 1 && isTRUE(verbose)) { insight::format_warning( "Proportions of sampled training sets (`proportion`) sums up to 1, so no test set will be generated." ) } if (is.null(row_id)) { row_id <- ".row_id" } # check that name of row-id doesn't exist to prevent existing data # from overwriting. create new unique name for row-id then... if (row_id %in% colnames(data)) { if (isTRUE(verbose)) { insight::format_warning( paste0("A variable named \"", row_id, "\" already exists."), "Changing the value of `row_id` to a unique variable name now." ) } unique_names <- make.unique(c(colnames(data), row_id), sep = "_") row_id <- unique_names[length(unique_names)] } if (!is.null(seed)) { set.seed(seed) } # add row-id column data[[row_id]] <- seq_len(nrow(data)) # Create list of data groups. We generally lapply over list of # sampled row-id's by group, thus, we even create a list if not grouped. if (is.null(by)) { indices_list <- list(seq_len(nrow(data))) } else { # else, split by group(s) and extract row-ids per group indices_list <- lapply( split(data, data[by]), data_extract, select = row_id, as_data_frame = FALSE ) } # iterate over (grouped) row-id's training_sets <- lapply(indices_list, function(i) { # return value, list of data frames d <- list() # row-id's by groups indices <- i # check length of group (= data) n <- length(indices) # iterate probabilities. we use for/next, so we can change # the "indices" variable, where we remove already sampled id's for (p in proportion) { # training-id's, sampled from id's per group - size is % within each group training <- sort(sample(indices, round(n * p))) # remove already sampled id's from group-indices indices <- setdiff(indices, training) # each training set data frame as one list element d[[length(d) + 1]] <- data[training, ] } d }) # we need to move all list elements one level higher. if (is.null(by)) { training_sets <- training_sets[[1]] } else { # for grouped training sets, we need to row-bind all sampled training # sets from each group. currently, we have a list of data frames, # grouped by "group"; but we want one data frame per proportion that # contains sampled rows from all groups. training_sets <- lapply(seq_along(proportion), function(p) { do.call(rbind, lapply(training_sets, function(i) i[[p]])) }) } # use probabilies as element names names(training_sets) <- sprintf("p_%g", proportion) # remove all training set id's from data, add remaining data (= test set) all_ids <- lapply( training_sets, data_extract, select = row_id, as_data_frame = FALSE ) out <- c( training_sets, list(test = data[-unlist(all_ids, use.names = FALSE), ]) ) lapply(out, `row.names<-`, NULL) } ================================================ FILE: R/data_peek.R ================================================ #' @title Peek at values and type of variables in a data frame #' @name data_peek #' #' @description This function creates a table a data frame, showing all #' column names, variable types and the first values (as many as fit into #' the screen). #' #' @param x A data frame. #' @param width Maximum width of line length to display. If `NULL`, width will #' be determined using `options()$width`. #' @param ... not used. #' @inheritParams extract_column_names #' #' @note To show only specific or a limited number of variables, use the #' `select` argument, e.g. `select = 1:5` to show only the first five variables. #' #' @return A data frame with three columns, containing information about #' the name, type and first values of the input data frame. #' #' @examples #' data(efc) #' data_peek(efc) #' # show variables two to four #' data_peek(efc, select = 2:4) #' @export data_peek <- function(x, ...) { UseMethod("data_peek") } #' @rdname data_peek #' @export data_peek.data.frame <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, width = NULL, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) out <- do.call( rbind, lapply(select, function(i) { .data_peek(x, i, width, verbose = verbose, ...) }) ) class(out) <- c("dw_data_peek", class(out)) attr(out, "n_cols") <- ncol(x) attr(out, "n_rows") <- nrow(x) attr(out, "max_width") <- ifelse(is.null(width), 0.9 * options()$width, width) out } # methods ----------------- #' @export print.dw_data_peek <- function(x, ...) { x <- format(x, ...) caption <- sprintf( "Data frame with %i rows and %i variables", attributes(x)$n_rows, attributes(x)$n_cols ) cat(insight::export_table(x, align = "lll", caption = caption, ...)) } #' @export print_md.dw_data_peek <- function(x, ...) { x <- format(x, ...) caption <- sprintf( "Data frame with %i rows and %i variables", attributes(x)$n_rows, attributes(x)$n_cols ) insight::export_table( x, align = "lll", format = "markdown", caption = caption, ... ) } #' @export print_html.dw_data_peek <- function(x, ...) { x <- format(x, ...) caption <- sprintf( "Data frame with %i rows and %i variables", attributes(x)$n_rows, attributes(x)$n_cols ) insight::export_table( x, align = "lll", format = "html", caption = caption, ... ) } #' @export format.dw_data_peek <- function(x, ...) { width_col1 <- max(nchar(x$Variable)) width_col2 <- max(nchar(x$Type)) max_width <- attributes(x)$max_width if (is.null(max_width)) { max_width <- 0.9 * options()$width } width_col3 <- max_width - (width_col1 + width_col2 + 10) # 10 = separator chars in table # shorten value-string x$Values <- substr(x$Values, 0, width_col3) # make sure we have a clear truncation, at last "comma" x$Values <- gsub("(.+)(,.+)$", "\\1", x$Values) # add "..." x$Values <- paste0(x$Values, ", ...") x } # helper ----------------- .data_peek <- function(x, variable, width = NULL, verbose = TRUE, ...) { v_name <- variable v_type <- .variable_type(x[[variable]]) v_type[v_type == "categorical"] <- "factor" max_width <- ifelse(is.null(width), 0.9 * options()$width, width) v_values <- toString(x[[variable]][1:max_width]) data.frame( Variable = v_name, Type = v_type, Values = v_values, stringsAsFactors = FALSE ) } ================================================ FILE: R/data_read.R ================================================ #' @title Read (import) data files from various sources #' @name data_read #' #' @description #' This functions imports data from various file types. It is a small wrapper #' around `haven::read_spss()`, `haven::read_stata()`, `haven::read_sas()`, #' `readxl::read_excel()` and `data.table::fread()` resp. `readr::read_delim()` #' (the latter if package **data.table** is not installed). Thus, supported file #' types for importing data are data files from SPSS, SAS or Stata, Excel files #' or text files (like '.csv' files). All other file types are passed to #' `rio::import()`. `data_write()` works in a similar way. #' #' @param path Character string, the file path to the data file. #' @param path_catalog Character string, path to the catalog file. Only relevant #' for SAS data files. #' @param encoding The character encoding used for the file. Usually not needed. #' @param convert_factors If `TRUE` (default), numeric variables, where all #' values have a value label, are assumed to be categorical and converted into #' factors. If `FALSE`, no variable types are guessed and no conversion of #' numeric variables into factors will be performed. For `data_read()`, this #' argument only applies to file types with *labelled data*, e.g. files from #' SPSS, SAS or Stata. See also section 'Differences to other packages'. For #' `data_write()`, this argument only applies to the text (e.g. `.txt` or #' `.csv`) or spreadsheet file formats (like `.xlsx`). Converting to factors #' might be useful for these formats because labelled numeric variables are then #' converted into factors and exported as character columns - else, value labels #' would be lost and only numeric values are written to the file. #' @param password Password for data encryption. If not `NULL`, the data will be #' encrypted (for `data_write()`) or decrypted (for `data_read()`) using the #' provided password. Encryption is currently only supported for R file formats #' (`.rds`, `.rda` and `.rdata`). See the section "Data encryption" below for more #' information on the encryption method used. #' @param verbose Toggle warnings and messages. #' @param ... Arguments passed to the related `read_*()` or `write_*()` functions. #' #' @return A data frame. #' #' @section Supported file types: #' - `data_read()` is a wrapper around the **haven**, **data.table**, **readr** #' **readxl**, **nanoparquet** and **rio** packages. Currently supported file #' types are `.txt`, `.csv`, `.xls`, `.xlsx`, `.sav`, `.por`, `.dta`, `.sas`, #' `.rda`, `.parquet`, `.rdata`, and `.rds` (and related files). All other file #' types are passed to `rio::import()`. #' - `data_write()` is a wrapper around **haven**, **readr**, **nanoparquet**, #' and **rio** packages, and supports writing files into all formats supported #' by these packages. #' #' @section Compressed files (zip) and URLs: #' `data_read()` can also read the above mentioned files from URLs or from #' inside zip-compressed files. Thus, `path` can also be a URL to a file like #' `"http://www.url.com/file.csv"`. When `path` points to a zip-compressed file, #' and there are multiple files inside the zip-archive, then the first supported #' file is extracted and loaded. #' #' @section General behaviour: #' `data_read()` detects the appropriate `read_*()` function based on the #' file-extension of the data file. Thus, in most cases it should be enough to #' only specify the `path` argument. However, if more control is needed, all #' arguments in `...` are passed down to the related `read_*()` function. The #' same applies to `data_write()`, i.e. based on the file extension provided in #' `path`, the appropriate `write_*()` function is used automatically. #' #' @section SPSS specific behaviour: #' `data_read()` does *not* import user-defined ("tagged") `NA` values from #' SPSS, i.e. argument `user_na` is always set to `FALSE` when importing SPSS #' data with the **haven** package. Use `convert_to_na()` to define missing #' values in the imported data, if necessary. Furthermore, `data_write()` #' compresses SPSS files by default. If this causes problems with (older) SPSS #' versions, use `compress = "none"`, for example #' `data_write(data, "myfile.sav", compress = "none")`. #' #' @section Differences to other packages that read foreign data formats: #' `data_read()` is most comparable to `rio::import()`. For data files from #' SPSS, SAS or Stata, which support labelled data, variables are converted into #' their most appropriate type. The major difference to `rio::import()` is for #' data files from SPSS, SAS, or Stata, i.e. file types that support #' *labelled data*. `data_read()` automatically converts fully labelled numeric #' variables into factors, where imported value labels will be set as factor #' levels. If a numeric variable has _no_ value labels or less value labels than #' values, it is not converted to factor. In this case, value labels are #' preserved as `"labels"` attribute. Character vectors are preserved. Use #' `convert_factors = FALSE` to remove the automatic conversion of numeric #' variables to factors. #' #' @section Data encryption: #' `data_read()` and `data_write()` support data encryption for R file formats #' (`.rds`, `.rda` and `.rdata`). To encrypt a file, provide a password to the #' `password` argument in `data_write()`. To decrypt the file, provide the same #' password to `data_read()`. The encryption is based on the **openssl** package #' and uses the AES-GCM algorithm (see `?openssl::aes_gcm_encrypt`) with a #' 256-bit key (see `?openssl::sha256`). Thus, data can also be decrypted without #' relying on the **datawizard** package, e.g. using following code: #' #' ``` #' encrypted_data <- readRDS(datafile) #' key <- openssl::sha256(charToRaw("")) #' out <- openssl::aes_gcm_decrypt(encrypted_data, key = key) #' decrypted_data <- unserialize(out) #' ``` #' #' **Warning:** Do not lose your `password`, else you will not be able to #' decrypt the data again! #' #' @export data_read <- function( path, path_catalog = NULL, encoding = NULL, convert_factors = TRUE, password = NULL, verbose = TRUE, ... ) { # extract first valid file from zip-file if (identical(.file_ext(path), "zip")) { path <- .extract_zip(path) } # check for valid file type file_type <- .file_ext(path) if (!is.character(file_type) || file_type == "") { insight::format_error( "Could not detect file type. The `path` argument has no file extension.", "Please provide a file path including extension, like \"myfile.csv\" or \"c:/Users/Default/myfile.sav\"." ) } # read data out <- switch( file_type, txt = , csv = .read_text(path, encoding, verbose, ...), rda = , rdata = .read_base_rda(path, file_type, password, verbose, ...), rds = .read_base_rds(path, password, verbose, ...), xls = , xlsx = .read_excel(path, encoding, verbose, ...), sav = , por = .read_spss(path, encoding, convert_factors, verbose, ...), dta = .read_stata(path, encoding, convert_factors, verbose, ...), sas7bdat = .read_sas( path, path_catalog, encoding, convert_factors, verbose, ... ), parquet = .read_parquet(path, verbose, ...), .read_unknown(path, file_type, verbose, ...) ) # tell user about empty columns if (verbose) { empty_cols <- empty_columns(out) # only message if we actually have empty columns if (length(empty_cols)) { insight::format_alert( sprintf("Following %i variables are empty:", length(empty_cols)), text_concatenate(names(empty_cols)), "\nUse `remove_empty_columns()` to remove them from the data frame." ) } } out } # helper ----------------------- .file_ext <- function(x) { pos <- regexpr("\\.([[:alnum:]]+)$", x) ifelse(pos > -1L, tolower(substring(x, pos + 1L)), "") } .extract_zip <- function(path) { files <- utils::unzip(path, list = TRUE) files_ext <- vapply(files$Name, .file_ext, FUN.VALUE = character(1L)) supported_filetypes <- c("txt", "csv", "xls", "xlsx", "sav", "por", "dta") dest <- files$Name[which(files_ext %in% supported_filetypes)] if (length(dest) > 0) { d <- tempfile() dir.create(d) utils::unzip(path, exdir = d) path <- file.path(d, dest[1]) } else { insight::format_error( "The zip-file does not contain any supported file types." ) } path } # process imported data from SPSS, SAS or Stata ----------------------- .post_process_imported_data <- function(x, convert_factors, verbose) { # user may decide whether we automatically detect variable type or not if (isTRUE(convert_factors)) { if (verbose) { msg <- "Variables where all values have associated labels are now converted into factors. If this is not intended, use `convert_factors = FALSE`." # nolint insight::format_alert(msg) } x[] <- lapply(x, function(i) { # only proceed if not all missing if (!all(is.na(i))) { # save labels value_labels <- attr(i, "labels", exact = TRUE) variable_labels <- attr(i, "label", exact = TRUE) # filter, so only matching value labels remain value_labels <- value_labels[value_labels %in% unique(i)] # guess variable type if (is.character(i)) { # we need this to drop haven-specific class attributes i <- as.character(i) } else if ( !is.null(value_labels) && length(value_labels) == insight::n_unique(i) ) { # if all values are labelled, we assume factor. Use labels as levels if (is.numeric(i)) { i <- factor(i, labels = names(value_labels)) } else { i <- factor(as.character(i), labels = names(value_labels)) } value_labels <- NULL attr(i, "converted_to_factor") <- TRUE } else { # else, fall back to numeric or factor i <- as.numeric(i) } # drop unused value labels value_labels <- value_labels[value_labels %in% unique(i)] if (length(value_labels) > 0L) { attr(i, "labels") <- value_labels } # add back variable label attr(i, "label") <- variable_labels } i }) # tell user how many variables were converted if (verbose) { cnt <- sum(vapply( x, function(i) isTRUE(attributes(i)$converted_to_factor), TRUE )) msg <- sprintf( "%i out of %i variables were fully labelled and converted into factors.", cnt, ncol(x) ) insight::format_alert(msg) } } else { # drop haven class attributes x[] <- lapply(x, function(i) { # save labels class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr")) i }) } class(x) <- "data.frame" x } # read functions ----------------------- .read_spss <- function(path, encoding, convert_factors, verbose, ...) { insight::check_if_installed( "haven", reason = paste0("to read files of type '", .file_ext(path), "'") ) if (verbose) { insight::format_alert("Reading data...") } out <- haven::read_sav(file = path, encoding = encoding, user_na = FALSE, ...) .post_process_imported_data(out, convert_factors, verbose) } .read_stata <- function(path, encoding, convert_factors, verbose, ...) { insight::check_if_installed( "haven", reason = paste0("to read files of type '", .file_ext(path), "'") ) if (verbose) { insight::format_alert("Reading data...") } out <- haven::read_dta(file = path, encoding = encoding, ...) .post_process_imported_data(out, convert_factors, verbose) } .read_sas <- function( path, path_catalog, encoding, convert_factors, verbose, ... ) { insight::check_if_installed( "haven", reason = paste0("to read files of type '", .file_ext(path), "'") ) if (verbose) { insight::format_alert("Reading data...") } out <- haven::read_sas( data_file = path, catalog_file = path_catalog, encoding = encoding, ... ) .post_process_imported_data(out, convert_factors, verbose) } .read_excel <- function(path, encoding, verbose, ...) { insight::check_if_installed( "readxl", reason = paste0("to read files of type '", .file_ext(path), "'") ) if (verbose) { insight::format_alert("Reading data...") } out <- readxl::read_excel(path, ...) class(out) <- "data.frame" out } .read_text <- function(path, encoding, verbose, ...) { if (insight::check_if_installed("data.table", quietly = TRUE)) { # set proper default encoding-value for fread if (is.null(encoding)) { encoding <- "unknown" } out <- data.table::fread(input = path, encoding = encoding, ...) return(as.data.frame(out)) } insight::check_if_installed( "readr", reason = paste0("to read files of type '", .file_ext(path), "'") ) if (verbose) { insight::format_alert("Reading data...") } out <- readr::read_delim(path, ...) as.data.frame(out) } .read_unknown <- function(path, file_type, verbose, ...) { insight::check_if_installed( "rio", reason = paste0("to read files of type '", file_type, "'") ) if (verbose) { insight::format_alert("Reading data...") } # set up arguments. for RDS, we set trust = TRUE, to avoid warnings rio_args <- list(file = path) # check if we have RDS, and if so, add trust = TRUE if (file_type %in% c("rds", "rdata", "rda")) { rio_args$trust <- TRUE } out <- do.call(rio::import, c(rio_args, list(...))) # check if loaded file is a data frame, or not (e.g. model objects) # it returns `NULL` if the file is no valid data file that contains a data # frame.frame, or cannot be coerced to a data frame. Else, if it was a data # frame or could be coerced into one, the (new) data frame is returned. In # this case, we overwrite "out", else we keep its original object. valid_data_object <- .get_data_from_loaded_file(out, verbose) # if file could be coerced to a data frame, overwrite out if (!is.null(valid_data_object)) { out <- valid_data_object } out } .read_base_rda <- function(path, file_type, password, verbose = TRUE, ...) { if (verbose) { insight::format_alert("Reading data...") } # check URLs path <- .check_path_url(path, file_type) # since RData and rda can keep multiple files, we load them into a # new environment and return them as list object then env <- new.env() load(file = path, envir = env) # if the RData file contains more than one object, we don't check the output # but just return everything if (length(ls(env)) > 1) { if (verbose) { insight::format_alert( "File contained more than one object, returning all objects." ) } return(as.list(env)) } # else, retrieve loaded object out <- get(ls(env)[1], env) # data decryption out <- .data_decryption(out, password) # check if loaded file is a data frame, or not (e.g. model objects) # it returns `NULL` if the file is no valid data file that contains a data # frame.frame, or cannot be coerced to a data frame. Else, if it was a data # frame or could be coerced into one, the (new) data frame is returned. In # this case, we overwrite "out", else we keep its original object. valid_data_object <- .get_data_from_loaded_file(out, verbose) # if file could be coerced to a data frame, overwrite out if (!is.null(valid_data_object)) { out <- valid_data_object } out } .read_base_rds <- function(path, password, verbose = TRUE, ...) { if (verbose) { insight::format_alert("Reading data...") } # check URLs path <- .check_path_url(path, file_type = "rds") out <- readRDS(file = path) # data decryption out <- .data_decryption(out, password) # check if loaded file is a data frame, or not (e.g. model objects) # it returns `NULL` if the file is no valid data file that contains a data # frame.frame, or cannot be coerced to a data frame. Else, if it was a data # frame or could be coerced into one, the (new) data frame is returned. In # this case, we overwrite "out", else we keep its original object. valid_data_object <- .get_data_from_loaded_file(out, verbose) # if file could be coerced to a data frame, overwrite out if (!is.null(valid_data_object)) { out <- valid_data_object } out } .read_parquet <- function(path, verbose = TRUE, ...) { # requires nanoparquet package insight::check_if_installed("nanoparquet") if (verbose) { insight::format_alert("Reading data...") } # check URLs path <- .check_path_url(path, file_type = "parquet") out <- nanoparquet::read_parquet(file = path, ...) as.data.frame(out) } # check input helper -------------------------------------------------------- # for URLs, we need to download the file and save it locally .check_path_url <- function(path, file_type) { url_pattern <- "^(https?|ftp)://(.*)" # check if file path is an URL if (grepl(url_pattern, path)) { insight::check_if_installed("curl") if (curl::has_internet()) { # if yes, create temp file and save file locally temp_file <- tempfile(fileext = paste0(".", file_type)) download <- curl::curl_fetch_memory(path) writeBin(object = download$content, con = temp_file) # return path to temp file path <- temp_file } else { insight::format_error( "No internet connection detected. Could not download file from URL." ) } } path } .get_data_from_loaded_file <- function(out, verbose = TRUE) { # it is also possible to read in pre-compiled model objects with data_read() # in this case, just return as is. We do this check before we check with # "is.data.frame()", because some models (like brmsfit) have an `as.data.frame()` # method, which coerces the model object into a data frame, which is likely to # be not intentional if (insight::is_model(out)) { if (verbose) { insight::format_alert( paste0( "Imported file is a regression model object of class \"", class(out)[1], "\"." ), "Returning file as is." ) } return(NULL) } # for "unknown" data formats (like .RDS), which still can be imported via # "rio::import()", we must check whether we actually have a data frame or # not. Else, tell user. if (!is.data.frame(out)) { tmp <- tryCatch( as.data.frame(out, stringsAsFactors = FALSE), error = function(e) NULL ) if (is.null(tmp)) { if (verbose) { insight::format_warning( paste0( "Imported file is no data frame, but of class \"", class(out)[1], "\"." ), "Returning file as is. Please check if importing this file was intended." ) } return(NULL) } out <- tmp } out } # decrypt data --------------------------------- .data_decryption <- function(data, password = NULL) { # check if data should be decrypted if (!is.null(password)) { .validate_password(password) data <- .decrypt_data(data, password) } data } .decrypt_data <- function(data, password = NULL) { insight::check_if_installed("openssl", "for data decryption") # it is important to remember the phrase! else, you cannot decrypt the data passphrase <- charToRaw(password) key <- openssl::sha256(passphrase) # decrypt the data. in case of wrong password, `unserialize()` errors out <- tryCatch( unserialize(openssl::aes_gcm_decrypt(data, key = key)), error = function(e) NULL ) # check if we had encrypted data at all? if (is.null(out)) { insight::format_error( "File does not appear to be encrypted with {datawizard}, or you provided the wrong password." ) } out } ================================================ FILE: R/data_relocate.R ================================================ #' @title Relocate (reorder) columns of a data frame #' @name data_relocate #' #' @description #' `data_relocate()` will reorder columns to specific positions, indicated by #' `before` or `after`. `data_reorder()` will instead move selected columns to #' the beginning of a data frame. Finally, `data_remove()` removes columns #' from a data frame. All functions support select-helpers that allow flexible #' specification of a search pattern to find matching columns, which should #' be reordered or removed. #' #' @param data A data frame. #' @param before,after Destination of columns. Supplying neither will move #' columns to the left-hand side; specifying both is an error. Can be a #' character vector, indicating the name of the destination column, or a #' numeric value, indicating the index number of the destination column. #' If `-1`, will be added before or after the last column. #' @inheritParams extract_column_names #' @inheritParams data_rename #' #' @inherit data_rename seealso #' #' @return A data frame with reordered columns. #' #' @examples #' # Reorder columns #' head(data_relocate(iris, select = "Species", before = "Sepal.Length")) #' head(data_relocate(iris, select = "Species", before = "Sepal.Width")) #' head(data_relocate(iris, select = "Sepal.Width", after = "Species")) #' # which is same as #' head(data_relocate(iris, select = "Sepal.Width", after = -1)) #' #' # Reorder multiple columns #' head(data_relocate(iris, select = c("Species", "Petal.Length"), after = "Sepal.Width")) #' # which is same as #' head(data_relocate(iris, select = c("Species", "Petal.Length"), after = 2)) #' #' # Reorder columns #' head(data_reorder(iris, c("Species", "Sepal.Length"))) #' #' @export data_relocate <- function( data, select, before = NULL, after = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # Sanitize if (!is.null(before) && !is.null(after)) { insight::format_error("You must supply only one of `before` or `after`.") } # allow numeric values if (!is.null(before) && is.numeric(before)) { if (before == -1) { before <- names(data)[ncol(data)] } else if (before >= 1 && before <= ncol(data)) { before <- names(data)[before] } else { insight::format_error("No valid position defined in `before`.") } } # allow numeric values if (!is.null(after) && is.numeric(after)) { if (after == -1) { after <- names(data)[ncol(data)] } else if (after >= 1 && after <= ncol(data)) { after <- names(data)[after] } else { insight::format_error("No valid position defined in `after`.") } } cols <- .select_nse( select, data, exclude = NULL, ignore_case = ignore_case, regex = regex, verbose = verbose ) # save attributes attr_data <- attributes(data) # Move columns to the right hand side data <- data[c(setdiff(names(data), cols), cols)] # Get columns and their original position data_cols <- names(data) position <- which(data_cols %in% cols) # remember original values, for more informative messages original_before <- before original_after <- after # Find new positions # nolint start if (!is.null(before)) { before <- before[before %in% data_cols][1] # Take first that exists (if vector is supplied) if (length(before) != 1 || is.na(before)) { # guess the misspelled column insight::format_error( "The column passed to `before` wasn't found.", .misspelled_string( data_cols, original_before[1], default_message = "Possibly misspelled?" ) ) } where <- min(match(before, data_cols)) position <- c(setdiff(position, where), where) } else if (!is.null(after)) { after <- after[after %in% data_cols][1] # Take first that exists (if vector is supplied) if (length(after) != 1 || is.na(after)) { # guess the misspelled column insight::format_error( "The column passed to `after` wasn't found.", .misspelled_string( data_cols, original_after[1], default_message = "Possibly misspelled?" ) ) } where <- max(match(after, data_cols)) position <- c(where, setdiff(position, where)) } else { where <- 1 position <- union(position, where) } # nolint end # Set left and right side lhs <- setdiff(seq(1, where - 1), position) rhs <- setdiff(seq(where + 1, ncol(data)), position) position <- unique(c(lhs, position, rhs)) position <- position[position <= length(data_cols)] out <- data[position] out <- .replace_attrs(out, attr_data) out } #' @rdname data_relocate #' @export data_reorder <- function( data, select, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { cols <- .select_nse( select, data, exclude = NULL, ignore_case = ignore_case, regex = regex, verbose = verbose ) remaining_columns <- setdiff(colnames(data), cols) out <- data[c(cols, remaining_columns)] out <- .replace_attrs(out, attributes(data)) out } ================================================ FILE: R/data_remove.R ================================================ #' @inheritParams extract_column_names #' @rdname data_relocate #' @examples #' # Remove columns #' head(data_remove(iris, "Sepal.Length")) #' head(data_remove(iris, starts_with("Sepal"))) #' @export data_remove <- function( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) { ## TODO set verbose = TRUE by default in a later update? # evaluate arguments select <- .select_nse( select, data, exclude = NULL, ignore_case = ignore_case, regex = regex, verbose = verbose ) # nothing to remove? if (!length(select)) { return(data) } out <- data[!colnames(data) %in% select] out <- .replace_attrs(out, attributes(data)) out } ================================================ FILE: R/data_rename.R ================================================ #' @title Rename columns and variable names #' @name data_rename #' #' @description Safe and intuitive functions to rename variables or rows in #' data frames. `data_rename()` will rename column names, i.e. it facilitates #' renaming variables. `data_rename_rows()` is a convenient shortcut #' to add or rename row names of a data frame, but unlike `row.names()`, its #' input and output is a data frame, thus, integrating smoothly into a #' possible pipe-workflow. #' #' @inheritParams extract_column_names #' @param data A data frame. #' @param replacement Character vector. Can be one of the following: #' - A character vector that indicates the new names of the columns selected #' in `select`. `select` and `replacement` must be of the same length. #' - A string (i.e. character vector of length 1) with a "glue" styled #' pattern. Currently supported tokens are: #' - `{col}` which will be replaced by the column name, i.e. the #' corresponding value in `select`. #' - `{n}` will be replaced by the number of the variable that is replaced. #' - `{letter}` will be replaced by alphabetical letters in sequential #' order. #' If more than 26 letters are required, letters are repeated, but have #' sequential numeric indices (e.g., `a1` to `z1`, followed by `a2` to #' `z2`). #' - Finally, the name of a user-defined object that is available in the #' environment can be used. Note that the object's name is not allowed to #' be one of the pre-defined tokens, `"col"`, `"n"` and `"letter"`. #' #' An example for the use of tokens is... #' ```r #' data_rename( #' mtcars, #' select = c("am", "vs"), #' replacement = "new_name_from_{col}" #' ) #' ``` #' ... which would return new column names `new_name_from_am` and #' `new_name_from_vs`. See 'Examples'. #' #' If `select` is a named vector, `replacement` is ignored. #' @param rows Vector of row names. #' @param ... Other arguments passed to or from other functions. #' #' @details #' `select` can also be a named character vector. In this case, the names are #' used to rename the columns in the output data frame. If you have a named #' list, use `unlist()` to convert it to a named vector. See 'Examples'. #' #' @return A modified data frame. #' #' @examples #' # Rename columns #' head(data_rename(iris, "Sepal.Length", "length")) #' #' # Use named vector to rename #' head(data_rename(iris, c(length = "Sepal.Length", width = "Sepal.Width"))) #' #' # Change all #' head(data_rename(iris, replacement = paste0("Var", 1:5))) #' #' # Use glue-styled patterns #' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}")) #' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "{col}_is_column_{n}")) #' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}")) #' #' # User-defined glue-styled patterns from objects in environment #' x <- c("hi", "there", "!") #' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}")) #' @seealso #' - Add a prefix or suffix to column names: [data_addprefix()], [data_addsuffix()] #' - Functions to reorder or remove columns: [data_reorder()], [data_relocate()], #' [data_remove()] #' - Functions to reshape, pivot or rotate data frames: [data_to_long()], #' [data_to_wide()], [data_rotate()] #' - Functions to recode data: [rescale()], [reverse()], [categorize()], #' [recode_values()], [slide()] #' - Functions to standardize, normalize, rank-transform: [center()], [standardize()], #' [normalize()], [ranktransform()], [winsorize()] #' - Split and merge data frames: [data_partition()], [data_merge()] #' - Functions to find or select columns: [data_select()], [extract_column_names()] #' - Functions to filter rows: [data_match()], [data_filter()] #' #' @export data_rename <- function(data, select = NULL, replacement = NULL, ...) { # check for valid input if (!is.data.frame(data)) { insight::format_error("Argument `data` must be a data frame.") } # change all names if no pattern specified select <- .select_nse( select, data, exclude = NULL, ignore_case = NULL, regex = NULL, allow_rename = TRUE, ifnotfound = "error" ) # Forbid partially named "select", # Ex: if select = c("foo" = "Species", "Sepal.Length") then the 2nd name and # 2nd value are "Sepal.Length" if (!is.null(names(select)) && any(names(select) == select)) { insight::format_error( "When `select` is a named vector, all elements must be named." ) } # check if `select` has names, and if so, use as "replacement" if (!is.null(names(select))) { replacement <- names(select) } # coerce to character replacement <- as.character(replacement) # check if `replacement` has no empty strings and no NA values invalid_replacement <- is.na(replacement) | !nzchar(replacement) if (any(invalid_replacement)) { if (is.null(names(select))) { # when user did not match `select` with `replacement` msg <- c( "`replacement` is not allowed to have `NA` or empty strings.", sprintf( "Following values in `select` have no match in `replacement`: %s", toString(select[invalid_replacement]) ) ) } else { # when user did not name all elements of `select` msg <- c( "Either name all elements of `select` or use `replacement`.", sprintf( "Following values in `select` were not named: %s", toString(select[invalid_replacement]) ) ) } insight::format_error(msg) } # if duplicated names in replacement, append ".2", ".3", etc. to duplicates # ex: c("foo", "foo") -> c("foo", "foo.2") if (anyDuplicated(replacement) > 0L) { dup <- as.data.frame(table(replacement)) dup <- dup[dup$Freq > 1, ] for (i in dup$replacement) { to_replace <- which(replacement == i)[-1] new_replacement <- paste0(i, ".", 1 + seq_along(to_replace)) replacement[to_replace] <- new_replacement } } # check if we have "glue" styled replacement-string glue_style <- length(replacement) == 1 && grepl("{", replacement, fixed = TRUE) if (length(replacement) > length(select)) { insight::format_error( "There are more names in `replacement` than in `select`." ) } else if (length(replacement) < length(select) && !glue_style) { insight::format_error( "There are more names in `select` than in `replacement`" ) } # if we have glue-styled replacement-string, create replacement select now if (glue_style) { replacement <- .glue_replacement(select, replacement) } for (i in seq_along(select)) { if (!is.na(replacement[i])) { data <- .data_rename(data, select[i], replacement[i]) } } data } #' @keywords internal .data_rename <- function(data, pattern, replacement) { if (!pattern %in% names(data)) { insight::format_error(paste0( "Variable `", pattern, "` is not in your data frame :/" )) } names(data) <- replace(names(data), names(data) == pattern, replacement) data } .glue_replacement <- function(pattern, replacement) { # this function replaces "glue" tokens into their related # real names/values. Currently, following tokens are accepted: # - {col}: replacement is the name of the column (indicated in "pattern") # - {letter}: replacement is lower-case alphabetically letter, in sequential order # - {n}: replacement is the number of the variable out of n, that should be renamed out <- rep_len("", length(pattern)) # for alphabetical letters, we prepare a string if we have more than # 26 columns to rename if (length(out) > 26) { long_letters <- paste0( rep.int(letters[1:26], times = ceiling(length(out) / 26)), rep(1:ceiling(length(out) / 26), each = 26) ) } else { long_letters <- letters[1:26] } long_letters <- long_letters[seq_along(out)] for (i in seq_along(out)) { # prepare pattern column_name <- pattern[i] out[i] <- replacement # replace first pre-defined token out[i] <- gsub( "(.*)(\\{col\\})(.*)", replacement = paste0("\\1", column_name, "\\3"), x = out[i] ) # replace second pre-defined token out[i] <- gsub( "(.*)(\\{n\\})(.*)", replacement = paste0("\\1", i, "\\3"), x = out[i] ) # replace third pre-defined token out[i] <- gsub( "(.*)(\\{letter\\})(.*)", replacement = paste0("\\1", long_letters[i], "\\3"), x = out[i] ) # extract all non-standard tokens matches <- unlist( regmatches(out[i], gregexpr("\\{([^}]*)\\}", out[i])), use.names = FALSE ) # do we have any additional tokens, i.e. variable names from the environment? # users can also specify variable names, where the if (length(matches)) { # if so, iterate all tokens for (token in matches) { # evaluate token-object from the environment values <- .dynEval( str2lang(gsub("\\{(.*)\\}", "\\1", token)), ifnotfound = insight::format_error(paste0( "The object `", token, "` was not found. Please check if it really exists." )) ) # check for correct length if (length(values) != length(pattern)) { insight::format_error(paste0( "The number of values provided in `", token, "` (", length(values), " values) do not match the number of columns to rename (", length(pattern), " columns)." )) } # replace token with values from the object if (length(values)) { out[i] <- gsub(token, values[i], out[i], fixed = TRUE) } } } } out } # Row.names ---------------------------------------------------------------- #' @rdname data_rename #' @export data_rename_rows <- function(data, rows = NULL) { row.names(data) <- rows data } ================================================ FILE: R/data_replicate.R ================================================ #' @title Expand (i.e. replicate rows) a data frame #' @name data_replicate #' #' @description #' Expand a data frame by replicating rows based on another variable that #' contains the counts of replications per row. #' #' @param data A data frame. #' @param expand The name of the column that contains the counts of replications #' for each row. Can also be a numeric value, indicating the position of that #' column. Note that the variable indicated by `expand` must be an integer vector. #' @param remove_na Logical. If `TRUE`, missing values in the column #' provided in `expand` are removed from the data frame. If `FALSE` and `expand` #' contains missing values, the function will throw an error. #' @param ... Currently not used. #' @inheritParams extract_column_names #' #' @return A dataframe with each row replicated as many times as defined in `expand`. #' #' @examples #' data(mtcars) #' data_replicate(head(mtcars), "carb") #' @export data_replicate <- function( data, expand = NULL, select = NULL, exclude = NULL, remove_na = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) { # we need a name for the new column if (is.null(expand)) { insight::format_error( "No column that should be used to expand the data frame was provided. Please use `expand` to define a column." ) } # only one column name if (length(expand) > 1) { insight::format_error( "Please provide only a single string for `expand`, no character vector with multiple values." ) } # check if numerics, and if so, use column name if (is.numeric(expand)) { expand <- colnames(data)[expand] } # check if in data if (!expand %in% colnames(data)) { insight::format_error( "The column provided in `expand` does not exist in the data frame.", .misspelled_string(colnames(data), expand, "Possibly misspelled?") ) } # check that "expand" contains no Inf if (any(is.infinite(data[[expand]]))) { insight::format_error( "The column provided in `expand` contains infinite values. Please provide a column that does not contain infinite values." # nolint ) } # check that "expand" is integer if (!.is_integer(data[[expand]])) { insight::format_error( "The column provided in `expand` is not of type integer. Please provide a column that contains integer values." # nolint ) } # evaluate select/exclude, may be select-helpers select <- .select_nse( select, data, exclude, ignore_case, regex = regex, verbose = verbose ) # extract variable that contains the counts of replicates replicates <- data[[expand]] # we can remove that column now data[[expand]] <- NULL # also remove "expand" from "select" string select <- setdiff(select, expand) # if user doesn't want to remove "NA", but replicates contain "NA", # give informative error here if (!remove_na && anyNA(replicates)) { insight::format_error( "The column provided in `expand` contains missing values, but `remove_na` is set to `FALSE`.", "Please set `remove_na` to `TRUE` or remove the missing values from the `expand` variable." ) } # remove rows where "expand" is NA data <- data[!is.na(replicates), , drop = FALSE] replicates <- replicates[!is.na(replicates)] # fin as.data.frame(do.call( cbind, lapply(data[select], rep.int, times = replicates) )) } # is.integer(c(1, 2)) -> FALSE # all(c(1, 2) %% 1 == 0) -> TRUE .is_integer <- function(x, remove_na = TRUE) { if (remove_na) { x <- x[!is.na(x)] } tryCatch( all(x %% 1 == 0), warning = function(w) is.integer(x), error = function(e) FALSE ) } ================================================ FILE: R/data_rescale.R ================================================ #' @title Rescale Variables to a New Range #' @name rescale #' #' @description #' Rescale variables to a new range. Can also be used to reverse-score variables #' (change the keying/scoring direction), or to expand a range. #' #' @inheritParams categorize #' @inheritParams extract_column_names #' @inheritParams standardize.data.frame #' @param to Numeric vector of length 2 giving the new range that the variable #' will have after rescaling. To reverse-score a variable, the range should #' be given with the maximum value first. See examples. #' @param multiply If not `NULL`, `to` is ignored and `multiply` will be used, #' giving the factor by which the actual range of `x` should be expanded. #' For example, if a vector ranges from 5 to 15 and `multiply = 1.1`, the current #' range of 10 will be expanded by the factor of 1.1, giving a new range of #' 11. Thus, the rescaled vector would range from 4.5 to 15.5. #' @param add A vector of length 1 or 2. If not `NULL`, `to` is ignored and `add` #' will be used, giving the amount by which the minimum and maximum of the #' actual range of `x` should be expanded. For example, if a vector ranges from #' 5 to 15 and `add = 1`, the range will be expanded from 4 to 16. If `add` is #' of length 2, then the first value is used for the lower bound and the second #' value for the upper bound. #' @param range Initial (old) range of values. If `NULL`, will take the range of #' the input vector (`range(x)`). #' @param ... Arguments passed to or from other methods. #' #' @inheritSection center Selection of variables - the `select` argument #' #' @seealso See [makepredictcall.dw_transformer()] for use in model formulas. #' @family transform utilities #' #' @return A rescaled object. #' #' @examples #' rescale(c(0, 1, 5, -5, -2)) #' rescale(c(0, 1, 5, -5, -2), to = c(-5, 5)) #' rescale(c(1, 2, 3, 4, 5), to = c(-2, 2)) #' #' # Specify the "theoretical" range of the input vector #' rescale(c(1, 3, 4), to = c(0, 40), range = c(0, 4)) #' #' # Reverse-score a variable #' rescale(c(1, 2, 3, 4, 5), to = c(5, 1)) #' rescale(c(1, 2, 3, 4, 5), to = c(2, -2)) #' #' # Data frames #' head(rescale(iris, to = c(0, 1))) #' head(rescale(iris, to = c(0, 1), select = "Sepal.Length")) #' #' # One can specify a list of ranges #' head(rescale(iris, to = list( #' "Sepal.Length" = c(0, 1), #' "Petal.Length" = c(-1, 0) #' ))) #' #' # "expand" ranges by a factor or a given value #' x <- 5:15 #' x #' # both will expand the range by 10% #' rescale(x, multiply = 1.1) #' rescale(x, add = 0.5) #' #' # expand range by different values #' rescale(x, add = c(1, 3)) #' #' # Specify list of multipliers #' d <- data.frame(x = 5:15, y = 5:15) #' rescale(d, multiply = list(x = 1.1, y = 0.5)) #' @export rescale <- function(x, ...) { UseMethod("rescale") } #' @rdname rescale #' @export change_scale <- function(x, ...) { # Alias for rescale() rescale(x, ...) } #' @export rescale.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( paste0( "Variables of class `", class(x)[1], "` can't be rescaled and remain unchanged." ) ) } x } #' @rdname rescale #' @export rescale.numeric <- function( x, to = c(0, 100), multiply = NULL, add = NULL, range = NULL, verbose = TRUE, ... ) { if (is.null(to)) { return(x) } # Warning if all NaNs if (all(is.na(x))) { return(x) } if (is.null(range)) { range <- c(min(x, na.rm = TRUE), max(x, na.rm = TRUE)) } # check if user specified "multiply" or "add", and then update "to" to <- .update_to(x, to, multiply, add) # called from "makepredictcal()"? Then we have additional arguments dot_args <- list(...) required_dot_args <- c("min_value", "max_value", "new_min", "new_max") flag_predict <- FALSE if (all(required_dot_args %in% names(dot_args))) { # we gather informatiom about the original data, which is needed # for "predict()" to work properly when "rescale()" is called # in formulas on-the-fly, e.g. "lm(mpg ~ rescale(hp), data = mtcars)" min_value <- dot_args$min_value max_value <- dot_args$max_value new_min <- dot_args$new_min new_max <- dot_args$new_max flag_predict <- TRUE } else { min_value <- ifelse(is.na(range[1]), min(x, na.rm = TRUE), range[1]) max_value <- ifelse(is.na(range[2]), max(x, na.rm = TRUE), range[2]) new_min <- ifelse(is.na(to[1]), min_value, to[1]) new_max <- ifelse(is.na(to[2]), max_value, to[2]) } # Warning if only one value if (!flag_predict && insight::has_single_value(x) && is.null(range)) { if (verbose) { insight::format_warning( "A `range` must be provided for data with only one unique value." ) } return(x) } out <- as.vector( (new_max - new_min) / (max_value - min_value) * (x - min_value) + new_min ) attr(out, "min_value") <- min_value attr(out, "max_value") <- max_value attr(out, "new_min") <- new_min attr(out, "new_max") <- new_max attr(out, "range_difference") <- max_value - min_value attr(out, "to_range") <- c(new_min, new_max) # don't add attribute when we call data frame methods if (!isFALSE(dot_args$add_transform_class)) { class(out) <- c("dw_transformer", class(out)) } out } #' @export rescale.grouped_df <- function( x, select = NULL, exclude = NULL, to = c(0, 100), multiply = NULL, add = NULL, range = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) { info <- attributes(x) grps <- attr(x, "groups", exact = TRUE)[[".rows"]] # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, remove_group_var = TRUE, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_r", preserve_value_labels = TRUE ) # update processed arguments x <- my_args$x select <- my_args$select } x <- as.data.frame(x) for (rows in grps) { x[rows, ] <- rescale( x[rows, , drop = FALSE], select = select, exclude = exclude, to = to, multiply = multiply, add = add, range = range, append = FALSE, # need to set to FALSE here, else variable will be doubled add_transform_class = FALSE, ... ) } # set back class, so data frame still works with dplyr attributes(x) <- utils::modifyList(info, attributes(x)) x } #' @rdname rescale #' @export rescale.data.frame <- function( x, select = NULL, exclude = NULL, to = c(0, 100), multiply = NULL, add = NULL, range = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_r" ) # update processed arguments x <- my_args$x select <- my_args$select } # Transform the range so that it is a list now if (!is.null(range) && !is.list(range)) { range <- stats::setNames(rep(list(range), length(select)), select) } # Transform the 'to' so that it is a list now if (!is.list(to)) { to <- stats::setNames(rep(list(to), length(select)), select) } # Transform the 'multiply' so that it is a list now if (!is.null(multiply) && !is.list(multiply)) { multiply <- stats::setNames(rep(list(multiply), length(select)), select) } # Transform the 'add' so that it is a list now if (!is.null(add) && !is.list(add)) { add <- stats::setNames(rep(list(add), length(select)), select) } # update "to" if user specified "multiply" or "add" to[] <- lapply(names(to), function(i) { .update_to(x[[i]], to[[i]], multiply[[i]], add[[i]]) }) x[select] <- as.data.frame(sapply( select, function(n) { rescale( x[[n]], to = to[[n]], range = range[[n]], add_transform_class = FALSE ) }, simplify = FALSE )) x } # helper ---------------------------------------------------------------------- # expand the new target range by multiplying or adding .update_to <- function(x, to, multiply, add) { # check if user specified "multiply" or "add", and if not, return "to" if (is.null(multiply) && is.null(add)) { return(to) } # only one of "multiply" or "add" can be specified if (!is.null(multiply) && !is.null(add)) { insight::format_error("Only one of `multiply` or `add` can be specified.") } # multiply? If yes, calculate the "add" value if (!is.null(multiply)) { # check for correct length if (length(multiply) > 1) { insight::format_error("The length of `multiply` must be 1.") } add <- (diff(range(x, na.rm = TRUE)) * (multiply - 1)) / 2 } # add? if (!is.null(add)) { # add must be of length 1 or 2 if (length(add) > 2) { insight::format_error("The length of `add` must be 1 or 2.") } # if add is of length 2, then the first value is used for the lower bound # and the second value for the upper bound if (length(add) == 2) { add_low <- add[1] add_high <- add[2] } else { add_low <- add_high <- add } to <- c(min(x, na.rm = TRUE) - add_low, max(x, na.rm = TRUE) + add_high) } to } ================================================ FILE: R/data_restoretype.R ================================================ #' Restore the type of columns according to a reference data frame #' #' @param data A data frame for which to restore the column types. #' @inheritParams data_to_long #' @inheritParams data_rename #' @param reference A reference data frame from which to find the correct #' column types. If `NULL`, each column is converted to numeric if it doesn't #' generate `NA`s. For example, `c("1", "2")` can be converted to numeric but not #' `c("Sepal.Length")`. #' @return #' #' A data frame with columns whose types have been restored based on the #' reference data frame. #' #' @examples #' data <- data.frame( #' Sepal.Length = c("1", "3", "2"), #' Species = c("setosa", "versicolor", "setosa"), #' New = c("1", "3", "4") #' ) #' #' fixed <- data_restoretype(data, reference = iris) #' summary(fixed) #' @export data_restoretype <- function(data, reference = NULL, ...) { for (col in names(data)) { # No reference data (regular fixing) ---------------- if (is.null(reference)) { if (is.character(data[[col]])) { data[[col]] <- coerce_to_numeric(data[[col]]) } } else { if (is.factor(reference[[col]]) && !is.factor(data[[col]])) { # Restore factor levels data[[col]] <- factor(data[[col]], levels = levels(reference[[col]])) } if (is.numeric(reference[[col]]) && !is.numeric(data[[col]])) { data[[col]] <- coerce_to_numeric(as.character(data[[col]])) } if (is.character(reference[[col]]) && !is.character(data[[col]])) { data[[col]] <- as.character(data[[col]]) } } } data } ================================================ FILE: R/data_reverse.R ================================================ #' Reverse-Score Variables #' #' Reverse-score variables (change the keying/scoring direction). #' #' @param range Range of values that is used as reference for reversing the #' scale. For numeric variables, can be `NULL` or a numeric vector of length #' two, indicating the lowest and highest value of the reference range. If #' `NULL`, will take the range of the input vector (`range(x)`). For factors, #' `range` can be `NULL`, a numeric vector of length two, or a (numeric) #' vector of at least the same length as factor levels (i.e. must be equal #' to or larger than `nlevels(x)`). Note that providing a `range` for factors #' usually only makes sense when factor levels are numeric, not characters. #' @param ... Arguments passed to or from other methods. #' @inheritParams categorize #' @inheritParams extract_column_names #' #' @inheritSection center Selection of variables - the `select` argument #' #' @examples #' reverse(c(1, 2, 3, 4, 5)) #' reverse(c(-2, -1, 0, 2, 1)) #' #' # Specify the "theoretical" range of the input vector #' reverse(c(1, 3, 4), range = c(0, 4)) #' #' # Factor variables #' reverse(factor(c(1, 2, 3, 4, 5))) #' reverse(factor(c(1, 2, 3, 4, 5)), range = 0:10) #' #' # Data frames #' head(reverse(iris)) #' head(reverse(iris, select = "Sepal.Length")) #' #' @return A reverse-scored object. #' #' @family transform utilities #' #' @inherit data_rename seealso #' #' @export reverse <- function(x, ...) { UseMethod("reverse") } #' @rdname reverse #' @export reverse_scale <- reverse #' @export reverse.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( paste0( "Variables of class '", class(x)[1], "' can't be recoded and remain unchanged." ) ) } x } #' @rdname reverse #' @export reverse.numeric <- function(x, range = NULL, verbose = TRUE, ...) { # Warning if all NaNs if (all(is.na(x))) { return(x) } # Warning if only one value if (insight::has_single_value(x) && is.null(range)) { if (verbose) { insight::format_warning( "A `range` must be provided for data with only one unique value." ) } return(x) } # no missing values allowed if (anyNA(range)) { insight::format_error("`range` is not allowed to have missing values.") } if (is.null(range)) { range <- c(min(x, na.rm = TRUE), max(x, na.rm = TRUE)) } # old minimum and maximum min_value <- min(range) max_value <- max(range) # check if a valid range (i.e. vector of length 2) is provided if (length(range) > 2) { insight::format_error( "`range` must be a numeric vector of length two, indicating lowest and highest value of the required range.", sprintf( "Did you want to provide `range = c(%g, %g)`?", min_value, max_value ) ) } new_min <- max_value new_max <- min_value out <- as.vector( (new_max - new_min) / (max_value - min_value) * (x - min_value) + new_min ) # labelled data? out <- .set_back_labels(out, x, reverse_values = TRUE) out } #' @export reverse.factor <- function(x, range = NULL, verbose = TRUE, ...) { # Warning if all NaNs if (all(is.na(x))) { return(x) } # Warning if only one value if (insight::has_single_value(x) && is.null(range)) { if (verbose) { insight::format_warning( "A `range` must be provided for data with only one unique value." ) } return(x) } # save for later use original_x <- x if (is.null(range)) { old_levels <- levels(x) } else { # no missing values allowed if (anyNA(range)) { insight::format_error("`range` is not allowed to have missing values.") } range_ok <- TRUE # if we have a vector of length 2 for range, and more factor levels, # we assume `range` indicates minimum and maximum of range values if (length(range) == 2 && nlevels(droplevels(x)) > 2) { if (is.numeric(range)) { range <- min(range):max(range) } else { # if range is of length 2, and we have more than 2 number of levels, # range must be numeric to indicate minima and maxima. if not, stop. range_ok <- FALSE } } if (length(range) > 2 && length(range) < nlevels(droplevels(x))) { # if range has more than two values, but fewer values than number of # factor levels, we cannot associate the reversed scale, so stop range_ok <- FALSE } if (!range_ok) { insight::format_error( "`range` must be one of the following:", "- a numeric vector of length two, indicating lowest and highest value of the required range,", "- a vector (numeric or character) of values with at least as many values as number of levels in `x`,", "- or `NULL`." ) } # check if no or not all old levels are in new range if (verbose) { if (!any(levels(x) %in% as.character(range))) { insight::format_warning( "No current factor level is included in `range`.", "Returned factor will only contain missing values." ) } else if (!all(levels(x) %in% as.character(range))) { insight::format_warning( "Not all current factor levels are included in `range`.", "Returned factor will contain missing values." ) } } old_levels <- range x <- factor(x, levels = range) } int_x <- as.integer(x) rev_x <- reverse(int_x, range = c(1, length(old_levels))) x <- factor(rev_x, levels = seq_along(old_levels), labels = old_levels) # labelled data? x <- .set_back_labels(x, original_x, reverse_values = TRUE) x } #' @export reverse.grouped_df <- function( x, select = NULL, exclude = NULL, range = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) { info <- attributes(x) grps <- attr(x, "groups", exact = TRUE)[[".rows"]] # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, remove_group_var = TRUE, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments arguments <- .process_append( x, select, append, append_suffix = "_r", preserve_value_labels = TRUE ) # update processed arguments x <- arguments$x select <- arguments$select } x <- as.data.frame(x) for (rows in grps) { x[rows, ] <- reverse( x[rows, , drop = FALSE], select = select, exclude = exclude, range = range, append = FALSE, # need to set to FALSE here, else variable will be doubled ... ) } # set back class, so data frame still works with dplyr attributes(x) <- utils::modifyList(info, attributes(x)) x } #' @rdname reverse #' @export reverse.data.frame <- function( x, select = NULL, exclude = NULL, range = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments arguments <- .process_append( x, select, append, append_suffix = "_r", preserve_value_labels = TRUE ) # update processed arguments x <- arguments$x select <- arguments$select } # Transform the range so that it is a list now if (!is.null(range) && !is.list(range)) { range <- stats::setNames(rep(list(range), length(select)), select) } x[select] <- lapply(select, function(n) { reverse(x[[n]], range = range[[n]]) }) x } ================================================ FILE: R/data_rotate.R ================================================ #' @title Rotate a data frame #' @name data_rotate #' #' @description #' This function rotates a data frame, i.e. columns become rows and vice versa. #' It's the equivalent of using `t()` but restores the `data.frame` class, #' preserves attributes and prints a warning if the data type is #' modified (see example). #' #' @param data A data frame. #' @param rownames Character vector (optional). If not `NULL`, the data frame's #' rownames will be added as (first) column to the output, with `rownames` #' being the name of this column. #' @param colnames Logical or character vector (optional). If `TRUE`, the values #' of the first column in `x` will be used as column names in the rotated data #' frame. If a character vector, values from that column are used as column #' names. #' @param verbose Toggle warnings. #' #' @inherit data_rename seealso #' #' @return A (rotated) data frame. #' #' @examples #' x <- mtcars[1:3, 1:4] #' #' x #' #' data_rotate(x) #' data_rotate(x, rownames = "property") #' #' # use values in 1. column as column name #' data_rotate(x, colnames = TRUE) #' data_rotate(x, rownames = "property", colnames = TRUE) #' #' # use either first column or specific column for column names #' x <- data.frame(a = 1:5, b = 11:15, c = 21:25) #' data_rotate(x, colnames = TRUE) #' data_rotate(x, colnames = "c") #' #' @export data_rotate <- function( data, rownames = NULL, colnames = FALSE, verbose = TRUE ) { # copy attributes attr_data <- attributes(data) # check if first column has column names to be used for rotated data if (isTRUE(colnames)) { colnames <- data[[1]] data <- data[-1] } else if ( !is.null(colnames) && is.character(colnames) && colnames %in% colnames(data) ) { cn_col <- which(colnames(data) == colnames) colnames <- data[[colnames]] data <- data[-cn_col] } else { colnames <- row.names(data) } # warning after possible removal of columns if ( verbose && insight::n_unique(vapply(data, typeof, FUN.VALUE = character(1L))) > 1L ) { insight::format_warning( "Your data frame contains mixed types of data. After transposition, all variables will be transformed into characters." ) # nolint } # rotate data frame by 90 degrees out <- as.data.frame(t(as.data.frame(data))) # add column names, if requested if (!is.null(colnames)) { # check if we have correct length of column names if (length(colnames) != ncol(out)) { insight::format_warning( "Length of provided column names does not match number of columns. No column names changed." ) } else { colnames(out) <- colnames } } # add rownames as a new column, if requested if (!is.null(rownames)) { out <- rownames_as_column(out, var = rownames) } out <- .replace_attrs(out, attr_data) out } #' @rdname data_rotate #' @export data_transpose <- data_rotate ================================================ FILE: R/data_seek.R ================================================ #' @title Find variables by their names, variable or value labels #' @name data_seek #' #' @description This functions seeks variables in a data frame, based on patterns #' that either match the variable name (column name), variable labels, value labels #' or factor levels. Matching variable and value labels only works for "labelled" #' data, i.e. when the variables either have a `label` attribute or `labels` #' attribute. #' #' `data_seek()` is particular useful for larger data frames with labelled #' data - finding the correct variable name can be a challenge. This function #' helps to find the required variables, when only certain patterns of variable #' names or labels are known. #' #' @param data A data frame. #' @param pattern Character string (regular expression) to be matched in `data`. #' May also be a character vector of length > 1. `pattern` is searched for in #' column names, variable label and value labels attributes, or factor levels of #' variables in `data`. #' @param seek Character vector, indicating where `pattern` is sought. Use one #' or more of the following options: #' #' - `"names"`: Searches in column names. `"column_names"` and `"columns"` are #' aliases for `"names"`. #' - `"labels"`: Searches in variable labels. Only applies when a `label` attribute #' is set for a variable. #' - `"values"`: Searches in value labels or factor levels. Only applies when a #' `labels` attribute is set for a variable, or if a variable is a factor. #' `"levels"` is an alias for `"values"`. #' - `"all"`: Searches in all of the above. #' @param fuzzy Logical. If `TRUE`, "fuzzy matching" (partial and close distance #' matching) will be used to find `pattern`. #' #' @return A data frame with three columns: the column index, the column name #' and - if available - the variable label of all matched variables in `data`. #' #' @examples #' # seek variables with "Length" in variable name or labels #' data_seek(iris, "Length") #' #' # seek variables with "dependency" in names or labels #' # column "e42dep" has a label-attribute "elder's dependency" #' data(efc) #' data_seek(efc, "dependency") #' #' # "female" only appears as value label attribute - default search is in #' # variable names and labels only, so no match #' data_seek(efc, "female") #' # when we seek in all sources, we find the variable "e16sex" #' data_seek(efc, "female", seek = "all") #' #' # typo, no match #' data_seek(iris, "Lenght") #' # typo, fuzzy match #' data_seek(iris, "Lenght", fuzzy = TRUE) #' @export data_seek <- function( data, pattern, seek = c("names", "labels"), fuzzy = FALSE ) { # check valid args if (!is.data.frame(data)) { insight::format_error("`data` must be a data frame.") } # check valid args seek <- intersect( seek, c("names", "labels", "values", "levels", "column_names", "columns", "all") ) if (is.null(seek) || !length(seek)) { insight::format_error( "`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\"." ) # nolint } pos1 <- pos2 <- pos3 <- NULL pos <- unlist(lapply(pattern, function(search_pattern) { # search in variable names? if (any(seek %in% c("names", "columns", "column_names", "all"))) { pos1 <- grep(search_pattern, colnames(data)) # find in near distance? if (fuzzy) { pos1 <- c( pos1, .fuzzy_grep(x = colnames(data), pattern = search_pattern) ) } } # search in variable labels? if (any(seek %in% c("labels", "all"))) { var_labels <- insight::compact_character(unlist(lapply( data, attr, which = "label", exact = TRUE ))) if (!is.null(var_labels) && length(var_labels)) { found <- grepl(search_pattern, var_labels) pos2 <- match(names(var_labels)[found], colnames(data)) # find in near distanc? if (fuzzy) { found <- .fuzzy_grep(x = var_labels, pattern = search_pattern) if (length(found)) { pos2 <- c(pos2, match(names(var_labels)[found], colnames(data))) } } } } # search for pattern in value labels or levels? if (any(seek %in% c("values", "levels", "all"))) { values <- insight::compact_list(lapply(data, function(i) { l <- attr(i, "labels", exact = TRUE) if (is.null(l) && is.factor(i)) { levels(i) } else { names(l) } })) if (!is.null(values) && length(values)) { found <- vapply( values, function(i) any(grepl(search_pattern, i)), logical(1) ) pos3 <- match(names(found)[found], colnames(data)) # find in near distance if (fuzzy) { found <- vapply( values, function(i) { length(.fuzzy_grep(x = i, pattern = search_pattern)) > 0 }, logical(1) ) if (any(found)) { pos3 <- c(pos3, match(names(found)[found], colnames(data))) } } } } c(pos1, pos2, pos3) })) # clean up pos <- unique(pos) # variable labels of matching variables var_labels <- vapply( colnames(data[pos]), function(i) { l <- attr(data[[i]], "label", exact = TRUE) if (is.null(l)) { i } else { l } }, character(1) ) out <- data.frame( index = pos, column = colnames(data)[pos], labels = var_labels, stringsAsFactors = FALSE ) # no row names rownames(out) <- NULL class(out) <- c("data_seek", "data.frame") out } # methods --------------------------------------------------------------------- #' @export print.data_seek <- function(x, ...) { if (nrow(x) == 0) { cat("No matches found.\n") } else { cat(insight::export_table(x, ...)) } } ================================================ FILE: R/data_select.R ================================================ #' @rdname extract_column_names #' @export data_select <- function( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { columns <- .select_nse( select, data, exclude, ignore_case = ignore_case, regex = regex, allow_rename = TRUE, verbose = FALSE ) # save attributes a <- attributes(data) if (!length(columns) || is.null(columns)) { if (isTRUE(verbose)) { insight::format_warning( "No column names that matched the required search pattern were found." ) } return(NULL) } out <- data[columns] # for named character vectors, we offer the service to directly rename the columns if (!is.null(names(columns))) { colnames(out) <- names(columns) } # add back attributes out <- .replace_attrs(out, a) out } ================================================ FILE: R/data_separate.R ================================================ #' @title Separate single variable into multiple variables #' @name data_separate #' #' @description #' Separates a single variable into multiple new variables. #' #' @param data A data frame. #' @param new_columns The names of the new columns, as character vector. If #' more than one variable was selected (in `select`), the new names are prefixed #' with the name of the original column. `new_columns` can also be a list of #' (named) character vectors when multiple variables should be separated. See #' 'Examples'. #' @param separator Separator between columns. Can be a character vector, which #' is then treated as regular expression, or a numeric vector that indicates at #' which positions the string values will be split. #' @param append Logical, if `FALSE` (default), removes original columns that #' were separated. If `TRUE`, all columns are preserved and the new columns are #' appended to the data frame. #' @param guess_columns If `new_columns` is not given, the required number of #' new columns is guessed based on the results of value splitting. For example, #' if a variable is split into three new columns, this will be considered as #' the required number of new columns, and columns are named `"split_1"`, #' `"split_2"` and `"split_3"`. When values from a variable are split into #' different amount of new columns, the `guess_column` can be either `"mode"` #' (number of new columns is based on the most common number of splits), `"min"` #' or `"max"` to use the minimum resp. maximum number of possible splits as #' required number of columns. #' @param fill How to deal with values that return fewer new columns after #' splitting? Can be `"left"` (fill missing columns from the left with `NA`), #' `"right"` (fill missing columns from the right with `NA`) or `"value_left"` #' or `"value_right"` to fill missing columns from left or right with the #' left-most or right-most values. #' @param extra How to deal with values that return too many new columns after #' splitting? Can be `"drop_left"` or `"drop_right"` to drop the left-most or #' right-most values, or `"merge_left"` or `"merge_right"` to merge the left- #' or right-most value together, and keeping all remaining values as is. #' @param merge_multiple Logical, if `TRUE` and more than one variable is selected #' for separating, new columns can be merged. Value pairs of all split variables #' are merged. #' @param merge_separator Separator string when `merge_multiple = TRUE`. Defines #' the string that is used to merge values together. #' @param convert_na Logical, if `TRUE`, character `"NA"` values are converted #' into real `NA` values. #' @param ... Currently not used. #' @inheritParams extract_column_names #' #' @seealso [`data_unite()`] #' #' @return A data frame with the newly created variable(s), or - when `append = TRUE` - #' `data` including new variables. #' #' @examples #' # simple case #' d <- data.frame( #' x = c("1.a.6", "2.b.7", "3.c.8"), #' stringsAsFactors = FALSE #' ) #' d #' data_separate(d, new_columns = c("a", "b", "c")) #' #' # guess number of columns #' d <- data.frame( #' x = c("1.a.6", NA, "2.b.6.7", "3.c", "x.y.z"), #' stringsAsFactors = FALSE #' ) #' d #' data_separate(d, guess_columns = "mode") #' #' data_separate(d, guess_columns = "max") #' #' # drop left-most column #' data_separate(d, guess_columns = "mode", extra = "drop_left") #' #' # merge right-most column #' data_separate(d, guess_columns = "mode", extra = "merge_right") #' #' # fill columns with fewer values with left-most values #' data_separate(d, guess_columns = "mode", fill = "value_left") #' #' # fill and merge #' data_separate( #' d, #' guess_columns = "mode", #' fill = "value_left", #' extra = "merge_right" #' ) #' #' # multiple columns to split #' d <- data.frame( #' x = c("1.a.6", "2.b.7", "3.c.8"), #' y = c("x.y.z", "10.11.12", "m.n.o"), #' stringsAsFactors = FALSE #' ) #' d #' # split two columns, default column names #' data_separate(d, guess_columns = "mode") #' #' # split into new named columns, repeating column names #' data_separate(d, new_columns = c("a", "b", "c")) #' #' # split selected variable new columns #' data_separate(d, select = "y", new_columns = c("a", "b", "c")) #' #' # merge multiple split columns #' data_separate( #' d, #' new_columns = c("a", "b", "c"), #' merge_multiple = TRUE #' ) #' #' # merge multiple split columns #' data_separate( #' d, #' new_columns = c("a", "b", "c"), #' merge_multiple = TRUE, #' merge_separator = "-" #' ) #' #' # separate multiple columns, give proper column names #' d_sep <- data.frame( #' x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), #' y = c("m.n.99.22", "77.f.g.34", "44.9", NA), #' stringsAsFactors = FALSE #' ) #' #' data_separate( #' d_sep, #' select = c("x", "y"), #' new_columns = list( #' x = c("A", "B", "C"), # separate "x" into three columns #' y = c("EE", "FF", "GG", "HH") # separate "y" into four columns #' ), #' verbose = FALSE #' ) #' @export data_separate <- function( data, select = NULL, new_columns = NULL, separator = "[^[:alnum:]]+", guess_columns = NULL, merge_multiple = FALSE, merge_separator = "", fill = "right", extra = "drop_right", convert_na = TRUE, exclude = NULL, append = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) { # we need at least one explicit choice for either `new_columns` or `guess_columns` if (is.null(new_columns) && is.null(guess_columns)) { insight::format_error( "Cannot separate values. Either `new_columns` or `guess_columns` must be provided." ) } # in case user did not provide names of new columns, we can try # to guess number of columns per variable guess_columns <- match.arg(guess_columns, choices = c("min", "max", "mode")) # make sure we have valid options for fill and extra fill <- match.arg( fill, choices = c("left", "right", "value_left", "value_right") ) extra <- match.arg( extra, choices = c("drop_left", "drop_right", "merge_left", "merge_right") ) # evaluate select/exclude, may be select-helpers select <- .select_nse( select, data, exclude, ignore_case, regex = regex, verbose = verbose ) # make new_columns as list, this works with single and multiple columns if (!is.null(new_columns) && !is.list(new_columns)) { new_columns <- rep(list(new_columns), times = length(select)) # if we have multiple columns that were separated, we avoid duplicated # column names of created variables by appending name of original column # however, we don't have duplicated column names when we merge them together # so don't create new column names when "merge_multiple" is FALSE. make_unique_colnames <- length(select) > 1 && !merge_multiple } else { # we don't want to create own unique column names when user explicitly # provided column names as a list, i.e. column names for each separated # variable make_unique_colnames <- FALSE } # make sure list of new column names is named if (!is.null(new_columns) && is.null(names(new_columns))) { names(new_columns) <- select } # iterate columns that should be split split_data <- lapply(select, function(sep_column) { # do we have known number of columns? if (is.null(new_columns)) { n_columns <- NULL } else { n_columns <- length(new_columns[[sep_column]]) } # make sure we have a character that we can split x <- data[[sep_column]] if (!is.character(x)) { x <- as.character(x) } # separate column into multiple strings if (is.numeric(separator)) { maxlen <- max(nchar(x), na.rm = TRUE) starts <- c(0, separator) ends <- c(separator - 1, maxlen) separated_columns <- lapply(seq_along(starts), function(i) { substr(x, starts[i], ends[i]) }) separated_columns <- as.data.frame( do.call(rbind, separated_columns), stringsAsFactors = FALSE ) } else { separated_columns <- strsplit(x, separator, perl = TRUE) } # how many new columns do we need? if (is.null(n_columns)) { # lengths of all split strings l <- lengths(separated_columns) # but without NA values l <- l[!vapply(l, function(i) all(is.na(i)), TRUE)] # define number of new columns, based on user-choice n_cols <- switch( guess_columns, min = min(l, na.rm = TRUE), max = max(l, na.rm = TRUE), mode = distribution_mode(l), ) # tell user if (verbose && insight::n_unique(l) != 1 && !is.numeric(separator)) { insight::format_alert(paste0( "Column `", sep_column, "` had different number of values after splitting. Variable was split into ", n_cols, " column", ifelse(n_cols > 1, "s", ""), "." )) } } else { # else, if we know number of columns, use that number n_cols <- n_columns } # main task here - fill or drop values for all columns separated_columns <- tryCatch( .fix_separated_columns( separated_columns, fill, extra, n_cols, sep_column, verbose ), error = function(e) NULL ) # catch error if (is.null(separated_columns)) { insight::format_error( "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" # nolint ) } # bind separated columns into data frame and set column names out <- as.data.frame(do.call(rbind, separated_columns)) # if no column names provided, use standard names if (is.null(new_columns[[sep_column]])) { new_column_names <- paste0(sep_column, "_", seq_along(out)) } else if (make_unique_colnames) { # if we have multiple columns that were separated, we avoid duplicated # column names of created variables by appending name of original column new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]]) } else { new_column_names <- new_columns[[sep_column]] } colnames(out) <- new_column_names out }) # any split performed? if (all(lengths(split_data) == 1)) { if (verbose) { insight::format_alert( "Separator probably not found. No values were split. Returning original data." ) } return(data) } # final preparation, bind or merge columns, make unique columm names if (isTRUE(merge_multiple) && length(split_data) > 1) { # we merge all split columns, which are currently saved as list # of data frames, together into one data frame for (i in 2:length(split_data)) { for (j in seq_along(split_data[[1]])) { split_data[[1]][[j]] <- gsub( " ", "", paste( split_data[[1]][[j]], split_data[[i]][[j]], sep = merge_separator ), fixed = TRUE ) } } split_data <- split_data[[1]] } else { # bind all columns split_data <- do.call(cbind, split_data) } # convert "NA" strings into real NA? if (convert_na) { split_data[] <- lapply(split_data, function(i) { i[i == "NA"] <- NA_character_ i }) } data <- cbind(data, split_data) if (!isTRUE(append)) { data[select] <- NULL } # fin data } #' @keywords internal .fix_separated_columns <- function( separated_columns, fill, extra, n_cols, sep_column, verbose = TRUE ) { warn_extra <- warn_fill <- FALSE for (sc in seq_along(separated_columns)) { i <- separated_columns[[sc]] # determine number of values in separated column n_values <- length(i) if (all(is.na(i))) { # we have NA values - so fill everything with NA out <- rep(NA_character_, times = n_cols) } else if (n_values > n_cols) { # we have more values than required - drop extra columns out <- switch( extra, drop_left = i[(n_values - n_cols + 1):n_values], drop_right = i[1:n_cols], merge_left = { tmp <- paste(i[1:(n_values - n_cols + 1)], collapse = " ") c(tmp, i[(n_values - n_cols + 2):n_values]) }, { tmp <- i[1:(n_cols - 1)] c(tmp, paste(i[n_cols:n_values], collapse = " ")) } ) warn_extra <- TRUE } else if (n_values < n_cols) { # we have fewer values than required - fill columns out <- switch( fill, left = c(rep(NA_character_, times = n_cols - n_values), i), right = c(i, rep(NA_character_, times = n_cols - n_values)), value_left = c(rep(i[1], times = n_cols - n_values), i), c(i, rep(i[length(i)], times = n_cols - n_values)) ) warn_fill <- TRUE } else { out <- i } separated_columns[[sc]] <- out } if (verbose) { if (warn_extra) { insight::format_alert(paste0( "`", sep_column, "`", " returned more columns than expected after splitting. ", switch( extra, drop_left = "Left-most columns have been dropped.", drop_right = "Right-most columns have been dropped.", merge_left = "Left-most columns have been merged together.", merge_right = "Right-most columns have been merged together." ) )) } if (warn_fill) { insight::format_alert(paste0( "`", sep_column, "`", "returned fewer columns than expected after splitting. ", switch( fill, left = "Left-most columns were filled with `NA`.", right = "Right-most columns were filled with `NA`.", value_left = "Left-most columns were filled with first value.", value_right = "Right-most columns were filled with last value." ) )) } } separated_columns } ================================================ FILE: R/data_summary.R ================================================ #' @title Summarize data #' @name data_summary #' #' @description This function can be used to compute summary statistics for a #' data frame or a matrix. #' #' @param x A (grouped) data frame. #' @param by Optional character string, indicating the names of one or more #' variables in the data frame. If supplied, the data will be split by these #' variables and summary statistics will be computed for each group. #' @param remove_na Logical. If `TRUE`, missing values are omitted from the #' grouping variable. If `FALSE` (default), missing values are included as a #' level in the grouping variable. #' @param suffix Optional, suffixes to be added to the new variable names, #' especially useful when a function returns several values (e.g. `quantile()`). #' Can be: #' * a character vector: all expressions in `...` must return the same number #' of values as elements in `suffix`. #' * a list of named character vectors: the names of elements in `suffix` must #' match the names of the expressions. It is also allowed to specify suffixes #' for selected expressions only. #' #' The new column names are a combination of the left-hand side (i.e., #' the name) of the expression and the related suffixes. If `suffix = NULL` (the #' default), and a summary expression returns multiple values, either the names #' of the returned values (if any) or automatically numbered suffixes such as #' `_1`, `_2`, etc. are used. See 'Examples'. #' @param ... One or more named expressions that define the new variable name #' and the function to compute the summary statistic. Example: #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided #' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`. The #' summary function `n()` can be used to count the number of observations. #' #' @return A data frame with the requested summary statistics. #' #' @examples #' data(iris) #' data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) #' data_summary( #' iris, #' MW = mean(Sepal.Width), #' SD = sd(Sepal.Width), #' by = "Species" #' ) #' #' # same as #' d <- data_group(iris, "Species") #' data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) #' #' # multiple groups #' data(mtcars) #' data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear")) #' #' # expressions can also be supplied as character strings #' data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear")) #' #' # count observations within groups #' data_summary(mtcars, observations = n(), by = c("am", "gear")) #' #' # first and last observations of "mpg" within groups #' data_summary( #' mtcars, #' first = mpg[1], #' last = mpg[length(mpg)], #' by = c("am", "gear") #' ) #' #' # allow more than one-column-summaries for expressions #' d <- data.frame( #' x = rnorm(100, 1, 1), #' y = rnorm(100, 2, 2), #' groups = rep(1:4, each = 25) #' ) #' #' # since we have multiple columns for one expression, the names of the #' # returned summary results are used as suffix by default #' data_summary( #' d, #' quant_x = quantile(x, c(0.25, 0.75)), #' mean_x = mean(x), #' quant_y = quantile(y, c(0.25, 0.5, 0.75)) #' ) #' #' # if a summary function, like `fivenum()`, returns no named vector, suffixes #' # are automatically numbered #' data_summary( #' d, #' quant_x = quantile(x, c(0.25, 0.75)), #' mean_x = mean(x), #' fivenum_y = fivenum(y) #' ) #' #' # specify column suffix for expressions, matching by names #' data_summary( #' d, #' quant_x = quantile(x, c(0.25, 0.75)), #' mean_x = mean(x), #' quant_y = quantile(y, c(0.25, 0.5, 0.75)), #' suffix = list(quant_y = c("_Q1", "_Q2", "_Q3")) #' ) #' #' # name multiple expression suffixes, grouped by variable #' data_summary( #' d, #' quant_x = quantile(x, c(0.25, 0.75)), #' mean_x = mean(x), #' quant_y = quantile(y, c(0.25, 0.5, 0.75)), #' suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q3")), #' by = "groups" #' ) #' #' @export data_summary <- function(x, ...) { UseMethod("data_summary") } #' @export data_summary.matrix <- function( x, ..., by = NULL, remove_na = FALSE, suffix = NULL ) { data_summary( as.data.frame(x), ..., by = by, remove_na = remove_na, suffix = suffix ) } #' @export data_summary.default <- function(x, ...) { insight::format_error( "`data_summary()` only works for (grouped) data frames and matrices." ) } #' @rdname data_summary #' @export data_summary.data.frame <- function( x, ..., by = NULL, remove_na = FALSE, suffix = NULL ) { dots <- eval(substitute(alist(...))) # do we have any expression at all? if (length(dots) == 0) { insight::format_error( "No expressions for calculating summary statistics provided." ) } if (is.null(by)) { # when we have no grouping, just compute a one-row summary summarise <- .process_datasummary_dots(dots, x, suffix) # coerce to data frame out <- as.data.frame(t(summarise)) colnames(out) <- names(summarise) } else { # sanity check - is "by" a character string? if (!is.character(by)) { insight::format_error( "Argument `by` must be a character string indicating the name of variables in the data." ) } # is "by" in the data? if (!all(by %in% colnames(x))) { by_not_found <- by[!by %in% colnames(x)] insight::format_error( paste0( "Variable", ifelse(length(by_not_found) > 1, "s ", " "), text_concatenate(by_not_found, enclose = "\""), " not found in the data." ), .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?") ) } # split data, add NA levels, if requested l <- lapply(x[by], function(i) { if (remove_na || !anyNA(i)) { i } else { addNA(i) } }) split_data <- split(x, l, drop = TRUE) out <- lapply(split_data, function(s) { # no data for combination? Return NULL if (nrow(s) == 0) { return(NULL) } # summarize data summarise <- .process_datasummary_dots(dots, s, suffix) # coerce to data frame summarised_data <- as.data.frame(t(summarise)) # bind grouping-variables and values summarised_data <- cbind(s[1, by], summarised_data) # make sure we have proper column names colnames(summarised_data) <- c(by, names(summarise)) summarised_data }) # check for correct number of columns. If one expression returns different # number of values (which now means, we have different number of columns # to bind) for each group, tell user if (!all(lengths(out) == lengths(out)[1])) { insight::format_error( "Each expression must return the same number of values for each group. Some of the expressions seem to return varying numbers of values." ) } out <- do.call(rbind, out) } # sort data out <- data_arrange(out, select = by) # data attributes class(out) <- c("dw_data_summary", "data.frame") rownames(out) <- NULL out } #' @export data_summary.grouped_df <- function( x, ..., by = NULL, remove_na = FALSE, suffix = NULL ) { # extract group variables grps <- attr(x, "groups", exact = TRUE) group_variables <- data_remove(grps, ".rows") # if "by" is not supplied, use group variables if (is.null(by)) { by <- colnames(group_variables) } # remove information specific to grouped df's attr(x, "groups") <- NULL class(x) <- "data.frame" data_summary(x, ..., by = by, remove_na = remove_na, suffix = suffix) } # helper ----------------------------------------------------------------------- .process_datasummary_dots <- function(dots, data, suffix = NULL) { out <- NULL if (length(dots)) { # we check for character vector of expressions, in which case # "dots" should be unnamed if (is.null(names(dots))) { # if we have multiple strings, concatenate them to a character vector # and put it into a list... if (length(dots) > 1) { if (all(vapply(dots, is.character, logical(1)))) { dots <- list(unlist(dots)) } else { insight::format_error( "You cannot mix string and literal representation of expressions." ) } } # expression is given as character string, e.g. # a <- "mean_sepwid = mean(Sepal.Width)" # data_summary(iris, a, by = "Species") # or as character vector, e.g. # data_summary(iris, c("var_a = mean(Sepal.Width)", "var_b = sd(Sepal.Width)")) character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) { NULL }) # do we have a character vector? Then we can proceed if (is.character(character_symbol)) { dots <- lapply(character_symbol, function(s) { # turn value from character vector into expression str2lang(.dynEval(s)) }) names(dots) <- vapply( dots, function(n) insight::safe_deparse(n[[2]]), character(1) ) } } # sanity check: check the input for the `suffix` argument # `suffix` can be NULL, or must be a (named) list if (!is.null(suffix)) { # if `suffix` is a character vector, we transform it into a list, # matching the names of the expressions if (is.character(suffix)) { suffix <- rep(list(suffix), length(dots)) names(suffix) <- names(dots) } # no list? error if (!is.list(suffix)) { insight::format_error( "Argument `suffix` must be a list of (named) character vectors, where the names match the names of the expressions, e.g.:", paste0( "`suffix = list(", names(dots)[1], " = c(\"_suffix1\", \"_suffix2\")`." ) ) } # not all elements named? error if (!length(which(nzchar(names(suffix), keepNA = TRUE)))) { insight::format_error("All elements of `suffix` must have names.") } # names of suffix do not match names of expressions? error if (!all(names(suffix) %in% names(dots))) { wrong_name <- which(!names(suffix) %in% names(dots))[1] insight::format_error( paste0( "Names of `suffix` must match the names of the expressions. Suffix `", names(suffix)[wrong_name], "` has no corresponding expression." ) ) } # identical suffixes for one expression? error identical_suffix <- vapply( suffix, function(i) insight::n_unique(i) != length(i), logical(1) ) if (any(identical_suffix)) { insight::format_error( paste0( "All suffixes for a single expression must be unique. Suffix for element `", names(identical_suffix)[which(identical_suffix)][1], "` has duplicate values." ) ) } } out <- lapply(seq_along(dots), function(i) { new_variable <- .get_new_dots_variable(dots, i, data) # check special case here - we want bayestestR::ci to work with # data summary, to easily create CIs for, say, posterior draws if (inherits(new_variable, c("bayestestR_ci", "bayestestR_eti"))) { stats::setNames(new_variable, c("CI", "CI_low", "CI_high")) } else { # init current_suffix <- NULL # find matches and set use suffix if found matching_names <- which(names(suffix) == names(dots)[i]) # either use suffixes based on matching names, or try to extract # names from the returned summary expression (saved in "new_variable"), # if the summary function returned a named vector if (length(matching_names) > 0) { current_suffix <- suffix[[matching_names]] } else if ( length(new_variable) > 1 && all(nzchar(names(new_variable), keepNA = TRUE)) ) { current_suffix <- names(new_variable) } # if we don't have suffixes for multiple columns, but expression # returns multiple columns, we get NA column names - we use # automatically numbered suffixes in this case if (is.null(current_suffix) && length(new_variable) > 1) { current_suffix <- paste0("_", seq_along(new_variable)) } # if number of suffixes does not match the number of returned values # by the expression, error if ( !is.null(current_suffix) && length(current_suffix) != length(new_variable) ) { insight::format_error( paste0( "Argument `suffix` must have the same length as the result of the corresponding summary expression. `suffix` has ", length(current_suffix), " elements (", text_concatenate(current_suffix, enclose = "`"), ") for the expression `", insight::safe_deparse(dots[[i]]), "`, which returned ", length(new_variable), " values." ) ) } stats::setNames(new_variable, paste0(names(dots)[i], current_suffix)) } }) } unlist(out) } # methods ---------------------------------------------------------------------- #' @export print.dw_data_summary <- function(x, ...) { if (nrow(x) == 0) { cat("No matches found.\n") } else { if (all(c("CI", "CI_low", "CI_high") %in% colnames(x))) { ci <- insight::format_table(x[c("CI", "CI_low", "CI_high")], ...) x$CI <- x$CI_low <- x$CI_high <- NULL x <- cbind(x, ci) } cat(insight::export_table(x, missing = "", ...)) } } ================================================ FILE: R/data_tabulate.R ================================================ #' @title Create frequency and crosstables of variables #' @name data_tabulate #' #' @description This function creates frequency or crosstables of variables, #' including the number of levels/values as well as the distribution of raw, #' valid and cumulative percentages. For crosstables, row, column and cell #' percentages can be calculated. #' #' @param x A (grouped) data frame, a vector or factor. #' @param by Optional vector or factor. If supplied, a crosstable is created. #' If `x` is a data frame, `by` can also be a character string indicating the #' name of a variable in `x`. #' @param drop_levels Logical, if `FALSE`, factor levels that do not occur in #' the data are included in the table (with frequency of zero), else unused #' factor levels are dropped from the frequency table. #' @param name Optional character string, which includes the name that is used #' for printing. #' @param remove_na Logical, if `FALSE`, missing values are included in the #' frequency or crosstable, else missing values are omitted. Note that the #' default for the `as.table()` method is `remove_na = TRUE`, so that missing #' values are not included in the returned table, which makes more sense for #' post-processing of the table, e.g. using `chisq.test()`. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. #' @param weights Optional numeric vector of weights. Must be of the same length #' as `x`. If `weights` is supplied, weighted frequencies are calculated. #' @param proportions Optional character string, indicating the type of #' percentages to be calculated. Only applies to crosstables, i.e. when `by` is #' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages) #' or `"full"` (to calculate relative frequencies for the full table). #' @param big_mark Optional character string, indicating the big mark that is #' used for large numbers. If `NULL` (default), a big mark is added automatically for #' large numbers (i.e. numbers with more than 5 digits). If you want to remove #' the big mark, set `big_mark = ""`. #' @param object An object returned by `data_tabulate()`. #' @param format String, indicating the output format. Can be `"markdown"` #' `"html"`, or `"tt"`. `format = "html"` create an HTML table using the *gt* #' package. `format = "tt"` creates a `tinytable` object, which is either #' printed as markdown or HTML table, depending on the environment. See #' [`insight::export_table()`] for details. #' @param verbose Toggle warnings and messages. #' @param ... not used. #' @inheritParams extract_column_names #' #' @details #' There is an `as.data.frame()` method, to return the frequency tables as a #' data frame. The structure of the returned object is a nested data frame, #' where the first column contains name of the variable for which frequencies #' were calculated, and the second column is a list column that contains the #' frequency tables as data frame. See [as.table.datawizard_table]. #' #' There is also an `as.table()` method, which returns a table object with the #' frequencies of the variable. This is useful for further statistical analysis, #' e.g. for using `chisq.test()` on the frequency table. See #' [as.table.datawizard_table]. #' #' @section Crosstables: #' If `by` is supplied, a crosstable is created. The crosstable includes `` #' (missing) values by default. The first column indicates values of `x`, the #' first row indicates values of `by` (including missing values). The last row #' and column contain the total frequencies for each row and column, respectively. #' Setting `remove_na = FALSE` will omit missing values from the crosstable. #' Setting `proportions` to `"row"` or `"column"` will add row or column #' percentages. Setting `proportions` to `"full"` will add relative frequencies #' for the full table. #' #' @note #' There are `print_html()` and `print_md()` methods available for printing #' frequency or crosstables in HTML and markdown format, e.g. #' `print_html(data_tabulate(x))`. The `print()` method for text outputs passes #' arguments in `...` to [`insight::export_table()`]. #' #' @return A data frame, or a list of data frames, with one frequency table #' as data frame per variable. #' #' @seealso [as.prop.table] #' #' @examplesIf requireNamespace("poorman") #' # frequency tables ------- #' # ------------------------ #' data(efc) #' #' # vector/factor #' data_tabulate(efc$c172code) #' #' # drop missing values #' data_tabulate(efc$c172code, remove_na = TRUE) #' #' # data frame #' data_tabulate(efc, c("e42dep", "c172code")) #' #' # grouped data frame #' suppressPackageStartupMessages(library(poorman, quietly = TRUE)) #' efc %>% #' group_by(c172code) %>% #' data_tabulate("e16sex") #' #' # collapse tables #' efc %>% #' group_by(c172code) %>% #' data_tabulate("e16sex", collapse = TRUE) #' #' # for larger N's (> 100000), a big mark is automatically added #' set.seed(123) #' x <- sample(1:3, 1e6, TRUE) #' data_tabulate(x, name = "Large Number") #' #' # to remove the big mark, use "print(..., big_mark = "")" #' print(data_tabulate(x), big_mark = "") #' #' # weighted frequencies #' set.seed(123) #' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) #' data_tabulate(efc$e42dep, weights = efc$weights) #' #' # crosstables ------ #' # ------------------ #' #' # add some missing values #' set.seed(123) #' efc$e16sex[sample.int(nrow(efc), 5)] <- NA #' #' data_tabulate(efc, "c172code", by = "e16sex") #' #' # add row and column percentages #' data_tabulate(efc, "c172code", by = "e16sex", proportions = "row") #' data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") #' #' # omit missing values #' data_tabulate( #' efc$c172code, #' by = efc$e16sex, #' proportions = "column", #' remove_na = TRUE #' ) #' #' # round percentages #' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") #' print(out, digits = 0) #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") } #' @rdname data_tabulate #' @export data_tabulate.default <- function( x, by = NULL, drop_levels = FALSE, weights = NULL, remove_na = FALSE, proportions = NULL, name = NULL, verbose = TRUE, ... ) { # save label attribute, before it gets lost... var_label <- attr(x, "label", exact = TRUE) # save and fix variable name, check for grouping variable obj_name <- tryCatch( insight::safe_deparse(substitute(x)), error = function(e) NULL ) if (identical(obj_name, "x[[i]]")) { obj_name <- name } group_variable <- list(...)$group_variable # check whether levels not present in data should be shown or not if (is.factor(x) && isTRUE(drop_levels)) { x <- droplevels(x) } # validate "weights" weights <- .validate_table_weights( weights, x, weights_expression = insight::safe_deparse(substitute(weights)) ) # we go into another function for crosstables here... if (!is.null(by)) { by <- .validate_by(by, x) return(.crosstable( x, by = by, weights = weights, remove_na = remove_na, proportions = proportions, obj_name = obj_name, group_variable = group_variable )) } # frequency table if (is.null(weights)) { if (remove_na) { # we have a `.default` and a `.data.frame` method for `data_tabulate()`. # since this is the default, `x` can be an object which cannot be used # with `table()`, that's why we add `tryCatch()` here. Below we give an # informative error message for non-supported objects. freq_table <- tryCatch(table(x), error = function(e) NULL) } else { freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) } } else if (remove_na) { # weighted frequency table, excluding NA freq_table <- tryCatch( stats::xtabs( weights ~ x, data = data.frame(weights = weights, x = x), na.action = stats::na.omit, addNA = FALSE ), error = function(e) NULL ) } else { # weighted frequency table, including NA freq_table <- tryCatch( stats::xtabs( weights ~ x, data = data.frame(weights = weights, x = addNA(x)), na.action = stats::na.pass, addNA = TRUE ), error = function(e) NULL ) } if (is.null(freq_table)) { insight::format_warning(paste0( "Can't compute frequency tables for objects of class `", class(x)[1], "`." )) return(NULL) } # create data frame with freq table and cumulative percentages etc. out <- data_rename( data.frame(freq_table, stringsAsFactors = FALSE), replacement = c("Value", "N") ) # we want to round N for weighted frequencies if (!is.null(weights)) { out$N <- round(out$N) } out$`Raw %` <- 100 * out$N / sum(out$N) # if we have missing values, we add a row with NA if (remove_na) { out$`Valid %` <- 100 * out$N / sum(out$N) valid_n <- sum(out$N, na.rm = TRUE) } else { out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) } out$`Cumulative %` <- cumsum(out$`Valid %`) # add information about variable/group names if (!is.null(obj_name)) { if (is.null(group_variable)) { var_info <- data.frame(Variable = obj_name, stringsAsFactors = FALSE) } else { var_info <- data.frame( Variable = obj_name, Group = toString(lapply(colnames(group_variable), function(i) { sprintf("%s (%s)", i, group_variable[[i]]) })), stringsAsFactors = FALSE ) } out <- cbind(var_info, out) } # save information attr(out, "type") <- .variable_type(x) attr(out, "varname") <- name attr(out, "label") <- var_label attr(out, "object") <- obj_name attr(out, "group_variable") <- group_variable attr(out, "duplicate_varnames") <- duplicated(out$Variable) attr(out, "weights") <- weights attr(out, "total_n") <- sum(out$N, na.rm = TRUE) attr(out, "valid_n") <- valid_n class(out) <- c("datawizard_table", "data.frame") out } #' @rdname data_tabulate #' @export data_tabulate.data.frame <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, by = NULL, drop_levels = FALSE, weights = NULL, remove_na = FALSE, proportions = NULL, collapse = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # validate "by" by <- .validate_by(by, x) # validate "weights" weights <- .validate_table_weights(weights, x) out <- lapply(select, function(i) { data_tabulate( x[[i]], by = by, proportions = proportions, drop_levels = drop_levels, weights = weights, remove_na = remove_na, name = i, verbose = verbose, ... ) }) if (is.null(by)) { class(out) <- c("datawizard_tables", "list") } else { class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) out } #' @export data_tabulate.grouped_df <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, by = NULL, proportions = NULL, drop_levels = FALSE, weights = NULL, remove_na = FALSE, collapse = FALSE, verbose = TRUE, ... ) { grps <- attr(x, "groups", exact = TRUE) group_variables <- data_remove(grps, ".rows") grps <- grps[[".rows"]] # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) x <- as.data.frame(x) out <- list() for (i in seq_along(grps)) { rows <- grps[[i]] # save information about grouping factors if (is.null(group_variables)) { group_variable <- NULL } else { group_variable <- group_variables[i, , drop = FALSE] } out <- c( out, data_tabulate( data_filter(x, rows), select = select, exclude = exclude, ignore_case = ignore_case, verbose = verbose, drop_levels = drop_levels, weights = weights, remove_na = remove_na, by = by, proportions = proportions, group_variable = group_variable, ... ) ) } if (is.null(by)) { class(out) <- c("datawizard_tables", "list") } else { class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) out } # methods -------------------- #' @importFrom insight print_html #' @export insight::print_html #' @importFrom insight print_md #' @export insight::print_md #' @importFrom insight display #' @export insight::display #' Convert a crosstable to a frequency or a propensity table #' #' @description #' `as.prop.table()` is an S3 generic. It can be used on objects of class #' `datawizard_crosstab` created by `data_tabulate()` when it was run with the #' arguments `by` and `proportions`. #' #' @param x An object created by `data_tabulate()`. It must be of class #' `datawizard_crosstab` for `as.prop.table()`. #' @param simplify Logical, if `TRUE`, the returned table is simplified to a #' single table object if there is only one frequency or contingency table #' input. Else, always for multiple table inputs or when `simplify = FALSE`, a #' list of tables is returned. This is only relevant for the `as.table()` #' methods. To ensure consistent output, the default is `FALSE`. #' @inheritParams data_tabulate #' #' @export #' @seealso [data_tabulate] #' #' @examples #' data(efc) #' #' # Some cross tabulation #' cross <- data_tabulate(efc, select = "e42dep", by = "c172code", proportions = "row") #' cross #' #' # Convert to a propensity table #' as.prop.table(cross) #' #' # Convert to data.frame #' result <- data_tabulate(efc, "c172code", by = "e16sex") #' as.data.frame(result) #' as.data.frame(result)$table #' as.data.frame(result, add_total = TRUE)$table #' #' # Convert to a table that can be passed to chisq.test() #' #' out <- data_tabulate(efc, "c172code", by = "e16sex") #' # we need to simplify the output, else we get a list of tables #' tbl <- as.table(out, simplify = TRUE) #' tbl #' suppressWarnings(chisq.test(tbl)) #' #' # apply chisq.test to each table #' out <- data_tabulate(efc, c("c172code", "e16sex")) #' suppressWarnings(lapply(as.table(out), chisq.test)) #' #' # can also handle grouped data frames #' d <- data_group(mtcars, "am") #' x <- data_tabulate(d, "cyl", by = "gear") #' as.table(x) as.prop.table <- function(x, ...) { UseMethod("as.prop.table") } #' @rdname as.prop.table #' @export as.prop.table.datawizard_crosstab <- function( x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ... ) { # sanity check - the `.data.frame` method returns a list, but not the # default method if (!is.data.frame(x)) { x <- x[[1]] } prop_table <- attributes(x)$prop_table if (is.null(prop_table)) { insight::format_warning("No proportions available.") return(NULL) } if (remove_na) { if ( verbose && ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table)) ) { insight::format_alert("Removing NA values from frequency table.") } if (!is.null(prop_table[["NA"]])) { prop_table[["NA"]] <- NULL } if ("NA" %in% rownames(prop_table)) { prop_table <- prop_table[rownames(prop_table) != "NA", ] } } # coerce to table result <- as.table(as.matrix(prop_table)) # if we don't want to simplify the table, we wrap it into a list if (!simplify) { result <- list(result) } result } #' @export as.prop.table.datawizard_crosstabs <- function( x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ... ) { # only show message once we set `verbose = FALSE` in the lapply() if (remove_na && verbose) { prop_table <- attributes(x[[1]])$prop_table if ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table)) { insight::format_alert("Removing NA values from frequency table.") } } out <- insight::compact_list(lapply( x, as.prop.table.datawizard_crosstab, remove_na = remove_na, simplify = TRUE, verbose = FALSE, ... )) # if no proportions found, return NULL if (!length(out)) { return(NULL) } # if only one table is returned, "unlist" if (length(out) == 1 && simplify) { out <- out[[1]] } out } # as.data.frame -------------------- #' @rdname as.prop.table #' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and #' column with the total N values are added to the data frame. `add_total` has #' no effect in `as.data.frame()` for simple frequency tables. #' @inheritParams base::as.data.frame #' @export as.data.frame.datawizard_tables <- function( x, row.names = NULL, optional = FALSE, ..., stringsAsFactors = FALSE, add_total = FALSE ) { # extract variables of frequencies selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) # coerce to data frame, remove rownames data_frames <- lapply(x, function(i) { # the `format()` methods for objects returned by `data_tabulate()` call # `as.data.frame()` - we have to pay attention to avoid infinite iterations # here. At the moment, this is no problem, as objects we have at this stage # are of class "datawizard_table" or "datawizard_crosstab", while this # `as.data.frame()` method is only called for "datawizard_tables" (the plural) # form). Else, we would need to modify the class attribute here, # e.g. class(i) <- "data.frame" if (add_total) { # to add the total column and row, we simply can call `format()` out <- as.data.frame(format(i)) for (cols in 2:ncol(out)) { # since "format()" returns a character matrix, we want to convert # the columns to numeric. We have to exclude the first column, as the # first column is character, due to the added "Total" value. out[[cols]] <- as.numeric(out[[cols]]) } # after formatting, we have a "separator" row for nicer printing. # this should also be removed out <- remove_empty_rows(out) } else { out <- as.data.frame(i) } rownames(out) <- NULL out }) # create nested data frame result <- data.frame( var = selected_vars, table = I(data_frames), stringsAsFactors = stringsAsFactors ) # consider additional arguments rownames(result) <- row.names result } #' @export as.data.frame.datawizard_crosstabs <- as.data.frame.datawizard_tables # as.table -------------------- #' @rdname as.prop.table #' @export as.table.datawizard_table <- function( x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ... ) { # sanity check - the `.data.frame` method (data_tabulate(mtcars, "cyl")) # returns a list, but not the default method (data_tabulate(mtcars$cyl)) if (!is.data.frame(x)) { x <- x[[1]] } # check if any table has NA values - the column "Value" contains the value # "NA", and the column "N" contains the frequency of this value. if (remove_na) { # .check_table_na() works on lists of data frames, so we wrap the data frame # into a list here if (verbose && .check_table_na(list(x))) { insight::format_alert("Removing NA values from frequency table.") } # remove NA values from the table x <- x[!is.na(x$Value), , drop = FALSE] } # coerce to table result <- as.table(stats::setNames(x[["N"]], x$Value)) # if we don't want to simplify the table, we wrap it into a list if (!simplify) { result <- list(result) } result } #' @export as.table.datawizard_tables <- function( x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ... ) { # only show message once we set `verbose = FALSE` in the lapply() if (remove_na && verbose && .check_table_na(x)) { insight::format_alert("Removing NA values from frequency table.") } out <- lapply( x, as.table.datawizard_table, remove_na = remove_na, # no nested lists simplify = TRUE, # no multiple messages verbose = FALSE, ... ) # if only one table is returned, "unlist" if (length(out) == 1 && simplify) { out <- out[[1]] } out } #' @export as.table.datawizard_crosstab <- function( x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ... ) { # sanity check - the `.data.frame` method returns a list, but not the # default method if (!is.data.frame(x)) { x <- x[[1]] } # check for grouped df - we need to remove the "Group" column if (.is_grouped_df_xtab(x)) { x$Group <- NULL } # first column contains the row names row_names <- as.character(x[[1]]) row_names[is.na(row_names)] <- "NA" # remove first column, set rownames x[[1]] <- NULL rownames(x) <- row_names if (remove_na) { if (verbose && .check_xtable_na(list(x))) { insight::format_alert("Removing NA values from frequency table.") } if (!is.null(x[["NA"]])) { x[["NA"]] <- NULL } if ("NA" %in% row_names) { x <- x[row_names != "NA", ] } } # coerce to table result <- as.table(as.matrix(x)) # if we don't want to simplify the table, we wrap it into a list if (!simplify) { result <- list(result) } result } #' @export as.table.datawizard_crosstabs <- function( x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ... ) { # only show message once we set `verbose = FALSE` in the lapply() if (remove_na && verbose && .check_xtable_na(x)) { insight::format_alert("Removing NA values from frequency table.") } out <- lapply( x, as.table.datawizard_crosstab, remove_na = remove_na, simplify = TRUE, verbose = FALSE, ... ) # if only one table is returned, "unlist" if (length(out) == 1 && simplify) { out <- out[[1]] } # if we have a grouped data frame, we save the grouping values as # names for the list if (.is_grouped_df_xtab(x)) { names(out) <- unlist( lapply(x, function(i) { i$Group[1] }), use.names = FALSE ) } out } .is_grouped_df_xtab <- function(x) { if (!is.data.frame(x)) { x <- x[[1]] } isTRUE(attributes(x)$grouped_df) } .check_table_na <- function(x) { # check if any table has NA values - the column "Value" contains the value # "NA", and the column "N" contains the frequency of this value. any(vapply(x, function(i) any(i$N[is.na(i$Value)] > 0), logical(1))) } .check_xtable_na <- function(x) { any(vapply( x, function(i) { # need to extract rownames, to check if we have a "NA" row row_names <- as.character(i[[1]]) row_names[is.na(row_names)] <- "NA" has_na <- FALSE # check for NA columns and rows if (!is.null(i[["NA"]])) { has_na <- any(i[["NA"]] > 0) } if ("NA" %in% row_names) { # for grouped data frames, we need to remove the "Group" column, else # the indexing -1 below won't work if (.is_grouped_df_xtab(i)) { i$Group <- NULL } # we need "as.data.frame()" for grouped df, else `as.vector()` fails has_na <- has_na | any(as.vector(as.data.frame(i[row_names == "NA", -1])) > 0) } has_na }, logical(1) )) } # format -------------------- #' @export format.datawizard_table <- function(x, format = "text", big_mark = NULL, ...) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) x$N <- as.character(x$N) # format data frame ftab <- insight::format_table(x, ...) ftab[] <- lapply(ftab, function(i) { i[i == ""] <- ifelse(identical(format, "text"), "", "(NA)") # nolint i }) ftab$N <- gsub("\\.00$", "", ftab$N) # insert big marks? ftab$N <- .add_commas_in_numbers(ftab$N, big_mark) ftab } .add_commas_in_numbers <- function(x, big_mark = NULL) { # sanity checks - for crosstables with `remove_na = FALSE`, nchar(x) fails, # and pretty() warns about non-numeric input. Thus, we skip if any NA value # is in `x`. if (anyNA(x)) { return(x) } # automatically add a big mark for large numbers if (is.null(big_mark) && any(nchar(x) > 5)) { big_mark <- "," } if (identical(big_mark, "")) { return(x) } if (!is.null(big_mark)) { x <- prettyNum(x, big.mark = big_mark) } x } # print -------------------- #' @rdname data_tabulate #' @export print.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type cat(.table_header(x, "text")) # grouped data? if yes, add information on grouping factor if (!is.null(a$group_variable)) { group_title <- paste0( "Grouped by ", toString(lapply(colnames(a$group_variable), function(i) { sprintf("%s (%s)", i, a$group_variable[[i]]) })) ) cat(insight::print_color(group_title, "blue")) cat("\n") } a$total_n <- .add_commas_in_numbers(a$total_n, big_mark) a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark) # summary of total and valid N (we may add mean/sd as well?) summary_line <- sprintf( "# total N=%s valid N=%s%s\n\n", a$total_n, a$valid_n, ifelse(is.null(a$weights), "", " (weighted)") ) cat(insight::print_color(summary_line, "blue")) # remove information that goes into the header/footer x$Variable <- NULL x$Group <- NULL # print table cat(insight::export_table( format(x, big_mark = big_mark, ...), cross = "+", missing = "", ... )) invisible(x) } #' @export print.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) a <- attributes(x) if (!isTRUE(a$collapse) || length(x) == 1) { for (i in seq_along(x)) { print(x[[i]], big_mark = big_mark, ...) if (i < length(x)) cat("\n") } } else { x <- lapply(x, function(i) { i_attr <- attributes(i) i <- format(i, format = "text", big_mark = big_mark, ...) if (!is.null(i$Variable)) { i$Variable[i_attr$duplicate_varnames] <- "" } if (!is.null(i$Group)) { i$Group[i_attr$duplicate_varnames] <- "" } i[nrow(i) + 1, ] <- "" i }) out <- do.call(rbind, x) if (is_weighted) { cat(insight::print_color("# Frequency Table (weighted)\n\n", "blue")) } else { cat(insight::print_color("# Frequency Table\n\n", "blue")) } # print table cat(insight::export_table( out, missing = "", cross = "+", empty_line = "-", ... )) } } # display -------------------- #' @rdname data_tabulate #' @export display.datawizard_table <- function( object, big_mark = NULL, format = "markdown", ... ) { format <- .display_default_format(format) fun_args <- list( x = object, big_mark = big_mark, ... ) # print table in HTML or markdown format if (format %in% c("html", "tt")) { fun_args$backend <- format do.call(print_html, fun_args) } else { do.call(print_md, fun_args) } } #' @export display.datawizard_tables <- display.datawizard_table #' @export display.datawizard_crosstab <- display.datawizard_table #' @export display.datawizard_crosstabs <- display.datawizard_table .display_default_format <- function(format) { format <- getOption("easystats_display_format", format) insight::validate_argument(format, c("markdown", "html", "md", "tt")) } # print_html -------------------- #' @export print_html.datawizard_table <- function(x, big_mark = NULL, ...) { .print_dw_table(x, format = "html", big_mark = big_mark, ...) } #' @export print_html.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) if (length(x) == 1) { print_html(x[[1]], big_mark = big_mark, ...) } else { x <- lapply(x, function(i) { i_attr <- attributes(i) i <- format(i, format = "html", big_mark = big_mark, ...) if (!is.null(i$Variable)) { i$Variable[i_attr$duplicate_varnames] <- "" } i }) out <- do.call(rbind, x) # print table insight::export_table( out, missing = "", caption = ifelse( is_weighted, "Frequency Table (weighted)", "Frequency Table" ), format = .check_format_backend(...), group_by = "Group" ) } } # print_md -------------------- #' @export print_md.datawizard_table <- function(x, big_mark = NULL, ...) { .print_dw_table(x, format = "markdown", big_mark = big_mark, ...) } #' @export print_md.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) if (length(x) == 1) { print_md(x[[1]], big_mark = big_mark, ...) } else { x <- lapply(x, function(i) { i_attr <- attributes(i) i <- format(i, format = "markdown", big_mark = big_mark, ...) if (!is.null(i$Variable)) { i$Variable[i_attr$duplicate_varnames] <- "" } if (!is.null(i$Group)) { i$Group[i_attr$duplicate_varnames] <- "" } i[nrow(i) + 1, ] <- "" i }) out <- do.call(rbind, x) # print table insight::export_table( out, missing = "(NA)", empty_line = "-", format = "markdown", title = ifelse( is_weighted, "Frequency Table (weighted)", "Frequency Table" ) ) } } # tools -------------------- .print_dw_table <- function(x, format = "markdown", big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type caption <- .table_header(x, format) # summary of total and valid N (we may add mean/sd as well?) footer <- sprintf( "total N=%i valid N=%i%s%s", a$total_n, a$valid_n, ifelse(is.null(a$weights), "", " (weighted)"), ifelse(format == "markdown", "\n\n", "") ) # remove information that goes into the header/footer x$Variable <- NULL x$Group <- NULL # this function is used by all four supported format, markdown, text, html # and tt (tinytable). For tt, we sometimes have format "html" and backend = "tt", # so we need to check for this special case backend <- switch(format, html = , tt = .check_format_backend(...), format) # print table insight::export_table( format(x, format = format, big_mark = big_mark, ...), title = caption, footer = footer, missing = "(NA)", format = backend ) } # we allow exporting HTML format based on "gt" or "tinytable" .check_format_backend <- function(...) { dots <- list(...) if (identical(dots$backend, "tt")) { "tt" } else { "html" } } .table_header <- function(x, format = "text") { a <- attributes(x) # assemble name, based on what information is available name <- NULL # fix object name if (identical(a$object, "x[[i]]")) { a$object <- NULL } if (!is.null(a$label)) { name <- a$label if (!is.null(a$varname)) { name <- paste0(name, " (", a$varname, ")") } else if (!is.null(a$object)) { name <- paste0(name, " (", a$object, ")") } } else if (!is.null(a$varname)) { name <- a$varname if (!is.null(a$object)) { name <- paste0(name, " (", a$object, ")") } } if (is.null(name) && !is.null(a$object)) { name <- a$object } # "table" header with variable label/name, and type if (identical(format, "text")) { out <- paste( insight::color_text(name, "red"), insight::color_text(sprintf("<%s>\n", a$type), "blue") ) } else { out <- paste0(name, " (", a$type, ")") } out } .variable_type <- function(x) { if (is.ordered(x)) { vt <- "ord" } else if (is.factor(x)) { vt <- "fct" } else if (class(x)[1] == "Date") { vt <- "date" } else { vt <- switch( typeof(x), logical = "lgl", integer = "int", double = "dbl", character = "chr", complex = "cpl", closure = "fn", environment = "env", typeof(x) ) } switch( vt, ord = "ordinal", fct = "categorical", dbl = "numeric", int = "integer", chr = "character", lbl = "labelled", cpl = "complex", lgl = "logical", vt ) } ================================================ FILE: R/data_to_long.R ================================================ #' @title Reshape (pivot) data from wide to long #' @name data_to_long #' #' @description #' This function "lengthens" data, increasing the number of rows and decreasing #' the number of columns. This is a dependency-free base-R equivalent of #' `tidyr::pivot_longer()`. #' #' @param data A data frame to convert to long format, so that it has more #' rows and fewer columns after the operation. #' @param names_to The name of the new column (variable) that will contain the #' _names_ from columns in `select` as values, to identify the source of the #' values. `names_to` can be a character vector with more than one column name, #' in which case `names_sep` or `names_pattern` must be provided in order to #' identify which parts of the column names go into newly created columns. #' See also 'Examples'. #' @param names_prefix A regular expression used to remove matching text from #' the start of each variable name. #' @param names_sep,names_pattern If `names_to` contains multiple values, this #' argument controls how the column name is broken up. `names_pattern` takes a #' regular expression containing matching groups, i.e. "()". #' @param values_to The name of the new column that will contain the _values_ of #' the columns in `select`. #' @param values_drop_na If `TRUE`, will drop rows that contain only `NA` in the #' `values_to` column. This effectively converts explicit missing values to #' implicit missing values, and should generally be used only when missing values #' in data were created by its structure. #' @param rows_to The name of the column that will contain the row names or row #' numbers from the original data. If `NULL`, will be removed. #' @param ... Currently not used. #' @inheritParams extract_column_names #' @param cols Identical to `select`. This argument is here to ensure compatibility #' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols` #' is used. #' #' @inherit data_rename seealso #' #' @details #' Reshaping data into long format usually means that the input data frame is #' in _wide_ format, where multiple measurements taken on the same subject are #' stored in multiple columns (variables). The long format stores the same #' information in a single column, with each measurement per subject stored in #' a separate row. The values of all variables that are not in `select` will #' be repeated. #' #' The necessary information for `data_to_long()` is: #' #' - The columns that contain the repeated measurements (`select`). #' - The name of the newly created column that will contain the names of the #' columns in `select` (`names_to`), to identify the source of the values. #' `names_to` can also be a character vector with more than one column name, #' in which case `names_sep` or `names_pattern` must be provided to specify #' which parts of the column names go into the newly created columns. #' - The name of the newly created column that contains the values of the #' columns in `select` (`values_to`). #' #' In other words: repeated measurements that are spread across several columns #' will be gathered into a single column (`values_to`), with the original column #' names, that identify the source of the gathered values, stored in one or more #' new columns (`names_to`). #' #' @return If a tibble was provided as input, `reshape_longer()` also returns a #' tibble. Otherwise, it returns a data frame. #' #' @examplesIf all(insight::check_if_installed(c("psych", "tidyr"), quietly = TRUE)) #' wide_data <- setNames( #' data.frame(replicate(2, rnorm(8))), #' c("Time1", "Time2") #' ) #' wide_data$ID <- 1:8 #' wide_data #' #' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3)) #' # probably doesn't make much sense to mix "time" and "id" #' data_to_long(wide_data) #' #' # Customizing the names #' data_to_long( #' wide_data, #' select = c("Time1", "Time2"), #' names_to = "Timepoint", #' values_to = "Score" #' ) #' #' # Reshape multiple columns into long format. #' mydat <- data.frame( #' age = c(20, 30, 40), #' sex = c("Female", "Male", "Male"), #' score_t1 = c(30, 35, 32), #' score_t2 = c(33, 34, 37), #' score_t3 = c(36, 35, 38), #' speed_t1 = c(2, 3, 1), #' speed_t2 = c(3, 4, 5), #' speed_t3 = c(1, 8, 6) #' ) #' # The column names are split into two columns: "type" and "time". The #' # pattern for splitting column names is provided in `names_pattern`. Values #' # of all "score_*" and "speed_*" columns are gathered into a single column #' # named "count". #' data_to_long( #' mydat, #' select = 3:8, #' names_to = c("type", "time"), #' names_pattern = "(score|speed)_t(\\d+)", #' values_to = "count" #' ) #' #' # Full example #' # ------------------ #' data <- psych::bfi # Wide format with one row per participant's personality test #' #' # Pivot long format #' very_long_data <- data_to_long(data, #' select = regex("\\d"), # Select all columns that contain a digit #' names_to = "Item", #' values_to = "Score", #' rows_to = "Participant" #' ) #' head(very_long_data) #' #' even_longer_data <- data_to_long( #' tidyr::who, #' select = new_sp_m014:newrel_f65, #' names_to = c("diagnosis", "gender", "age"), #' names_pattern = "new_?(.*)_(.)(.*)", #' values_to = "count" #' ) #' head(even_longer_data) #' @export data_to_long <- function( data, select = "all", names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, values_to = "value", values_drop_na = FALSE, rows_to = NULL, ignore_case = FALSE, regex = FALSE, ..., cols ) { # nolint original_data <- data # Prefer "cols" over "select" for compat with tidyr::pivot_longer # nolint start if (!missing(cols)) { select <- substitute(cols) cols <- .select_nse( select, data, exclude = NULL, ignore_case = ignore_case, regex = regex, ifnotfound = "error" ) } else if (!missing(select) || !is.null(select)) { cols <- .select_nse( select, data, exclude = NULL, ignore_case = ignore_case, regex = regex, ifnotfound = "error" ) } else { insight::format_error( "You need to specify columns to pivot, either with `select` or `cols`." ) } # nolint end if (length(names_to) > 1L && is.null(names_sep) && is.null(names_pattern)) { insight::format_error( "If you supply multiple names in `names_to`, you must also supply one of `names_sep` or `names_pattern`." ) } if (length(names_to) == 1L) { if (!is.null(names_sep)) { insight::format_error( "You can't use `names_sep` when `names_to` is of length 1." ) } if (!is.null(names_pattern)) { insight::format_error( "You can't use `names_pattern` when `names_to` is of length 1." ) } } # save custom attributes custom_attr <- attributes(data) # Remove tidyverse attributes, will add them back at the end if (inherits(data, "tbl_df")) { tbl_input <- TRUE data <- as.data.frame(data, stringsAsFactors = FALSE) } else { tbl_input <- FALSE } if (any(names_to %in% setdiff(names(data), cols))) { insight::format_error( "Some values of the columns specified in `names_to` are already present as column names.", paste0( "Either use another value in `names_to` or rename the following columns: ", text_concatenate(names_to[which( names_to %in% setdiff(names(data), cols) )]) ) ) } not_selected <- setdiff(names(data), cols) # create a temp id so that we know how to rearrange the rows once the data is # stacked not_stacked <- data[, not_selected, drop = FALSE] not_stacked[["_Rows"]] <- coerce_to_numeric(row.names(data)) # stack the selected columns stacked_data <- .stack(data[, cols, drop = FALSE])[, 2:1] # reorder the rows to have a repeated sequence when all vars are selected to # pivot # # See with following example: # wide_data <- data.frame(replicate(5, rnorm(10))) # data_to_long(wide_data) needs_to_rearrange <- length(not_selected) == 0L && is.null(rows_to) if (isTRUE(needs_to_rearrange)) { # https://stackoverflow.com/questions/73984957/efficient-way-to-reorder-rows-to-have-a-repeated-sequence stacked_data <- stacked_data[ matrix( seq_len(nrow(stacked_data)), nrow = length(unique(stacked_data$ind)), byrow = TRUE ), ] row.names(stacked_data) <- NULL } stacked_data <- data_rename(stacked_data, "values", values_to) # split columns if several names in names_to or names_pattern is specified if (length(names_to) > 1L) { if (is.null(names_pattern)) { # faster than strsplit tmp <- utils::read.csv( text = stacked_data$ind, sep = names_sep, stringsAsFactors = FALSE, header = FALSE ) names(tmp) <- paste0("V", seq_len(ncol(tmp))) tmp[tmp == ""] <- NA # nolint stacked_data$ind <- NULL stacked_data <- cbind(tmp, stacked_data) } else { tmp <- regmatches( unique(stacked_data$ind), regexec(names_pattern, unique(stacked_data$ind)) ) tmp <- as.data.frame(do.call(rbind, tmp), stringsAsFactors = FALSE) names(tmp) <- c("ind", names_to) # cbind + match is faster than merge # cbind doesn't remove identical columns so we need to manually remove "ind" # which is in both datasets stacked_data <- cbind( stacked_data, tmp[match(stacked_data[["ind"]], tmp[["ind"]]), -1] ) stacked_data$ind <- NULL } } stacked_data <- data_relocate(stacked_data, select = values_to, after = -1) # if columns in data frame have attributes (e.g. labelled data), `cbind()` # won't work, so we need to remove them. We'll set them back later not_stacked[] <- lapply(not_stacked, function(i) { # we can't remove *all* attributes, this will convert factors into integers attr(i, "label") <- NULL attr(i, "labels") <- NULL attr(i, "format.spss") <- NULL class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr")) i }) # reunite unselected data with stacked data out <- cbind( not_stacked, stats::setNames(stacked_data, c(names_to, values_to)), row.names = NULL ) if (!is.null(names_prefix)) { if (length(names_to) > 1L) { insight::format_error( "`names_prefix` only works when `names_to` is of length 1." ) } out[[names_to]] <- gsub(paste0("^", names_prefix), "", out[[names_to]]) } # rearrange the rows with the temp id if (length(not_selected) > 0L) { out <- data_arrange(out, "_Rows") } # Remove or rename the row index if (is.null(rows_to)) { out[["_Rows"]] <- NULL } else { out <- data_rename(out, "_Rows", rows_to) } if (values_drop_na) { out <- out[!is.na(out[, values_to]), , drop = FALSE] } # add back attributes out <- .replace_attrs(out, custom_attr) # add back tidyverse attributes if (isTRUE(tbl_input)) { class(out) <- c("tbl_df", "tbl", "data.frame") } # reset row names if (!insight::object_has_rownames(data)) { row.names(out) <- NULL } # set back labels shared_columns <- intersect(colnames(out), colnames(original_data)) for (i in shared_columns) { out[[i]] <- .set_back_labels( out[[i]], original_data[[i]], include_values = TRUE ) } out } #' Code adapted from utils::stack (but largely modified) #' #' @noRd .stack <- function(x) { ind <- rep(names(x), times = lengths(x)) # use do.call("c", ...) instead of unlist to preserve the date format (but a # bit slower) # can't use do.call("c", ...) all the time because its behavior changed with # factors in 4.1.0 values_are_dates <- all( vapply(x, .is_date, FUN.VALUE = logical(1L)) ) if (values_are_dates) { data.frame(values = do.call("c", unname(x)), ind, stringsAsFactors = FALSE) } else { data.frame( values = unlist(x, use.names = FALSE), ind, stringsAsFactors = FALSE ) } } #' @rdname data_to_long #' @export reshape_longer <- data_to_long ================================================ FILE: R/data_to_wide.R ================================================ #' Reshape (pivot) data from long to wide #' #' This function "widens" data, increasing the number of columns and decreasing #' the number of rows. This is a dependency-free base-R equivalent of #' `tidyr::pivot_wider()`. #' #' @param data A data frame to convert to wide format, so that it has more #' columns and fewer rows post-widening than pre-widening. #' @param id_cols The name of the column that identifies the rows in the data #' by which observations are grouped and the gathered data is spread into new #' columns. Usually, this is a variable containing an ID for observations that #' have been repeatedly measured. If `NULL`, it will use all remaining columns #' that are not in `names_from` or `values_from` as ID columns. `id_cols` can #' also be a character vector with more than one name of identifier columns. See #' also 'Details' and 'Examples'. #' @param names_from The name of the column in the original data whose values #' will be used for naming the new columns created in the widened data. Each #' unique value in this column will become the name of one of these new columns. #' In case `names_prefix` is provided, column names will be concatenated with #' the string given in `names_prefix`. If `values_from` specifies more than one #' variable that should be widened, the new column names are a combination of #' the old column names in `values_from` and the *values* from `names_from`, to #' avoid duplicate column names. #' @param names_prefix String added to the start of every variable name. This is #' particularly useful if `names_from` is a numeric vector and you want to create #' syntactic variable names. #' @param names_sep If `names_from` or `values_from` contains multiple variables, #' this will be used to join their values together into a single string to use #' as a column name. #' @param names_glue Instead of `names_sep` and `names_prefix`, you can supply a #' [glue specification](https://glue.tidyverse.org/index.html) that uses the #' `names_from` columns to create custom column names. Note that the only #' delimiters supported by `names_glue` are curly brackets, `{` and `}`. #' @param values_from The name of the columns in the original data that contains #' the values used to fill the new columns created in the widened data. Can also #' be one of the selection helpers (see argument `select` in [`data_select()`]). #' @param values_fill Defunct argument, which has no function anymore. Will be #' removed in future versions. #' @param verbose Toggle warnings. #' @param ... Not used for now. #' @inheritParams data_select #' #' @return If a tibble was provided as input, `data_to_wide()` also returns a #' tibble. Otherwise, it returns a data frame. #' #' @details #' Reshaping data into wide format usually means that the input data frame is #' in _long_ format, where multiple measurements taken on the same subject are #' stored in multiple rows. The wide format stores the same information in a #' single row, with each measurement stored in a separate column. Thus, the #' necessary information for `data_to_wide()` is: #' #' - The name of the column(s) that identify the groups or repeated measurements #' (`id_cols`). #' - The name of the column whose _values_ will become the new column names #' (`names_from`). Since these values may not necessarily reflect appropriate #' column names, you can use `names_prefix` to add a prefix to each newly #' created column name. #' - The name of the column(s) that contain the values (`values_from`) for the #' new columns that are created by `names_from`. #' #' In other words: repeated measurements, as indicated by `id_cols`, that are #' saved into the column `values_from` will be spread into new columns, which #' will be named after the values in `names_from`. See also 'Examples'. #' #' @examplesIf requireNamespace("lme4", quietly = TRUE) #' data_long <- read.table(header = TRUE, text = " #' subject sex condition measurement #' 1 M control 7.9 #' 1 M cond1 12.3 #' 1 M cond2 10.7 #' 2 F control 6.3 #' 2 F cond1 10.6 #' 2 F cond2 11.1 #' 3 F control 9.5 #' 3 F cond1 13.1 #' 3 F cond2 13.8 #' 4 M control 11.5 #' 4 M cond1 13.4 #' 4 M cond2 12.9") #' #' # converting long data into wide format #' data_to_wide( #' data_long, #' id_cols = "subject", #' names_from = "condition", #' values_from = "measurement" #' ) #' #' # converting long data into wide format with custom column names #' data_to_wide( #' data_long, #' id_cols = "subject", #' names_from = "condition", #' values_from = "measurement", #' names_prefix = "Var.", #' names_sep = "." #' ) #' #' # converting long data into wide format, combining multiple columns #' production <- expand.grid( #' product = c("A", "B"), #' country = c("AI", "EI"), #' year = 2000:2014 #' ) #' production <- data_filter(production, (product == "A" & country == "AI") | product == "B") #' production$production <- rnorm(nrow(production)) #' #' data_to_wide( #' production, #' names_from = c("product", "country"), #' values_from = "production", #' names_glue = "prod_{product}_{country}" #' ) #' #' # reshaping multiple long columns into wide format. to avoid duplicate #' # column names, new names are a combination of the old column names in #' # `values_from` and the values from `names_from` #' data_long <- read.table(header = TRUE, text = " #' subject_id time score anxiety test #' 1 1 10 5 NA #' 1 2 NA 7 NA #' 2 1 15 6 NA #' 2 2 12 NA NA #' 3 1 18 8 NA #' 5 2 11 4 NA #' 4 1 NA 5 NA #' 4 2 14 NA NA") #' #' data_to_wide( #' data_long, #' id_cols = "subject_id", #' names_from = "time", #' values_from = c("score", "anxiety", "test") #' ) #' #' # using the "sleepstudy" dataset #' data(sleepstudy, package = "lme4") #' #' # the sleepstudy data contains repeated measurements of average reaction #' # times for each subjects over multiple days, in a sleep deprivation study. #' # It is in long-format, i.e. each row corresponds to a single measurement. #' # The variable "Days" contains the timepoint of the measurement, and #' # "Reaction" contains the measurement itself. Converting this data to wide #' # format will create a new column for each day, with the reaction time as the #' # value. #' head(sleepstudy) #' #' data_to_wide( #' sleepstudy, #' id_cols = "Subject", #' names_from = "Days", #' values_from = "Reaction" #' ) #' #' # clearer column names #' data_to_wide( #' sleepstudy, #' id_cols = "Subject", #' names_from = "Days", #' values_from = "Reaction", #' names_prefix = "Reaction_Day_" #' ) #' #' # For unequal group sizes, missing information is filled with NA #' d <- subset(sleepstudy, Days %in% c(0, 1, 2, 3, 4))[c(1:9, 11:13, 16:17, 21), ] #' #' # long format, different number of "Subjects" #' d #' #' data_to_wide( #' d, #' id_cols = "Subject", #' names_from = "Days", #' values_from = "Reaction", #' names_prefix = "Reaction_Day_" #' ) #' @inherit data_rename seealso #' @export data_to_wide <- function( data, id_cols = NULL, values_from = "Value", names_from = "Name", names_sep = "_", names_prefix = "", names_glue = NULL, values_fill = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { ## TODO: remove in a future update (#645) if (!is.null(values_fill)) { insight::format_warning( "`values_fill` is defunct and has no function anymore. It will be removed in future versions.", "To handle missing values after widening, use `convert_na_to()` instead." ) } if (is.null(names_from) || !all(names_from %in% colnames(data))) { insight::format_error( "`names_from` must be the name of an existing column in `data`." ) } select <- substitute(values_from) values_from <- .select_nse( select, data, exclude = NULL, ignore_case, regex = regex, verbose = verbose ) if (is.null(values_from) || !length(values_from)) { insight::format_error( "No variable defined in `values_from` was found in the `data`." ) } if (is.null(id_cols)) { id_cols <- setdiff(colnames(data), c(names_from, values_from)) } else if (!all(id_cols %in% colnames(data))) { insight::format_error( "`id_cols` must be the names of existing columns in `data`." ) } # save custom attributes custom_attr <- attributes(data) current_colnames <- names(data) # Preserve attributes if (inherits(data, "tbl_df")) { tbl_input <- TRUE data <- as.data.frame(data, stringsAsFactors = FALSE) } else { tbl_input <- FALSE } variable_attr <- lapply(data, attributes) not_unstacked <- data[, id_cols, drop = FALSE] not_unstacked <- unique(not_unstacked) # unstack doesn't create NAs for combinations that don't exist (contrary to # reshape), so we need to complete the dataset before unstacking. new_data <- data # create an id with all variables that are not in names_from or values_from # so that we can create missing combinations between this id and names_from if (length(id_cols) > 1L) { new_data$temporary_id <- do.call( paste, c(new_data[, id_cols, drop = FALSE], sep = "_") ) } else if (length(id_cols) == 1L) { new_data$temporary_id <- new_data[[id_cols]] } else { new_data$temporary_id <- seq_len(nrow(new_data)) } # check that all_groups have all possible values for names_from # If not, need to complete the dataset with NA for values_from where names_from # didn't exist n_rows_per_group <- table(new_data$temporary_id) n_values_per_group <- insight::n_unique(n_rows_per_group) not_all_cols_are_selected <- length(id_cols) > 0L incomplete_groups <- (n_values_per_group > 1L && !all( unique(n_rows_per_group) %in% insight::n_unique(new_data[, names_from]) )) || (n_values_per_group == 1L && unique(n_rows_per_group) < length(unique(new_data[, names_from]))) # create missing combinations if (not_all_cols_are_selected && incomplete_groups) { expanded <- expand.grid( unique(new_data[["temporary_id"]]), unique(new_data[[names_from]]) ) names(expanded) <- c("temporary_id", names_from) new_data <- data_merge( new_data, expanded, join = "full", by = c("temporary_id", names_from), sort = FALSE ) # need to make a second temporary id to keep arrange values *without* # rearranging the whole dataset # Ex: # "B" 1 # "A" 3 # "A" NA # "B" NA # # must be rearranged as "B" "B" "A" "A" and not "A" "A" "B" "B" lookup <- data.frame( temporary_id = unique( new_data[!is.na(new_data[values_from]), "temporary_id"] ) ) lookup$temporary_id_2 <- seq_len(nrow(lookup)) new_data <- data_merge( new_data, lookup, by = "temporary_id", join = "left" ) # creation of missing combinations was done with a temporary id, so need # to fill columns that are not selected in names_from or values_from new_data[, id_cols] <- lapply(id_cols, function(x) { data <- data_arrange(new_data, c("temporary_id_2", x)) ind <- which(!is.na(data[[x]])) rep_times <- diff(c(ind, length(data[[x]]) + 1)) rep(data[[x]][ind], times = rep_times) }) new_data <- data_arrange(new_data, "temporary_id_2") } # don't need temporary ids anymore new_data$temporary_id <- NULL new_data$temporary_id_2 <- NULL # convert to wide format (returns the data and the order in which columns # should be ordered) unstacked <- .unstack( new_data, names_from, values_from, names_sep, names_prefix, names_glue ) out <- unstacked$out if (length(values_from) > 1L) { unstacked$col_order <- unique(data[, names_from]) unstacked$col_order <- as.vector( t(outer(values_from, unstacked$col_order, paste, sep = names_sep)) ) } # stop if some column names would be duplicated (follow tidyr workflow) if (any(unstacked$col_order %in% current_colnames)) { insight::format_error( "Some values of the columns specified in `names_from` are already present as column names.", paste0( "Either use `names_prefix` or rename the following columns: ", text_concatenate(current_colnames[which( current_colnames %in% unstacked$col_order )]) ) ) } # reorder columns out <- out[, unstacked$col_order, drop = FALSE] # need to add the wide data to the original data if (!insight::is_empty_object(not_unstacked)) { out <- cbind(not_unstacked, out) } row.names(out) <- NULL # add back attributes where possible for (i in colnames(out)) { attributes(out[[i]]) <- variable_attr[[i]] } # convert back to date if original values were dates values_are_dates <- all( vapply(data[, values_from, drop = FALSE], .is_date, FUN.VALUE = logical(1L)) ) if (values_are_dates) { for (i in unstacked$col_order) { out[[i]] <- as.Date.numeric(out[[i]], origin = "1970-01-01") } } # add back attributes out <- .replace_attrs(out, custom_attr) if (isTRUE(tbl_input)) { class(out) <- c("tbl_df", "tbl", "data.frame") } out } #' Adapted from `utils::unstack` (but largely modified) #' #' @noRd .unstack <- function( x, names_from, values_from, names_sep, names_prefix, names_glue = NULL ) { # get values from names_from (future colnames) if (is.null(names_glue)) { x$future_colnames <- do.call( paste, c(x[, names_from, drop = FALSE], sep = names_sep) ) } else { vars <- regmatches( names_glue, gregexpr("\\{\\K[^{}]+(?=\\})", names_glue, perl = TRUE) )[[1]] tmp_data <- x[, vars] x$future_colnames <- .gluestick(names_glue, src = tmp_data) } x$future_colnames <- paste0(names_prefix, x$future_colnames) # expand the values for each variable in "values_from" res <- list() for (i in seq_along(values_from)) { res[[i]] <- tapply(x[[values_from[i]]], x$future_colnames, as.vector) if (length(values_from) > 1L) { names(res[[i]]) <- paste0(values_from[i], names_sep, names(res[[i]])) } } # if there's a single variable in "values_from" and this variable only has # one value, need to make it a dataframe if (length(res) == 1L && !is.list(res[[1]])) { res <- data.frame( matrix( res[[1]], nrow = 1, dimnames = list(NULL, names(res[[1]])) ), stringsAsFactors = FALSE, check.names = FALSE ) } else { res <- unlist(res, recursive = FALSE) } # return the wide data and the order in which the new columns should be list( out = data.frame(res, stringsAsFactors = FALSE, check.names = FALSE), col_order = unique(x$future_colnames) ) } #' @rdname data_to_wide #' @export reshape_wider <- data_to_wide ================================================ FILE: R/data_unique.R ================================================ #' @title Keep only one row from all with duplicated IDs #' #' @description From all rows with at least one duplicated ID, #' keep only one. Methods for selecting the duplicated row are #' either the first duplicate, the last duplicate, or the "best" #' duplicate (default), based on the duplicate with the smallest #' number of `NA`. In case of ties, it picks the first #' duplicate, as it is the one most likely to be valid and #' authentic, given practice effects. #' #' Contrarily to `dplyr::distinct()`, `data_unique()` keeps all columns. #' #' @param keep The method to be used for duplicate selection, either "best" #' (the default), "first", or "last". #' @inheritParams extract_column_names #' #' @return A data frame, containing only the chosen duplicates. #' @seealso [data_duplicated()] #' @examples #' df1 <- data.frame( #' id = c(1, 2, 3, 1, 3), #' item1 = c(NA, 1, 1, 2, 3), #' item2 = c(NA, 1, 1, 2, 3), #' item3 = c(NA, 1, 1, 2, 3) #' ) #' #' data_unique(df1, select = "id") #' @export data_unique <- function( data, select = NULL, keep = "best", exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) { UseMethod("data_unique") } #' @export data_unique.data.frame <- function( data, select = NULL, keep = "best", exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) { select <- .select_nse( select, data, exclude = exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) # temporary_id <- paste0(sample(letters), collapse = "") data$temporary_id2 <- do.call(paste, c(data_select(data, select), sep = "_")) og.names <- names(data) dups <- data_duplicated(data, select) # if no duplicates, return the original data if (nrow(dups) == 0L) { data <- data_remove(data, "temporary_id2") return(data) } # count number of duplicates dups.n <- sum(duplicated(dups$temporary_id2)) good.dups <- data_group(dups, "temporary_id2") # keep row that has the least duplicates if (keep == "best") { good.dups <- data_filter(good.dups, "count_na == min(count_na)") } good.dups <- good.dups[ !duplicated(good.dups$temporary_id2, fromLast = keep == "last"), ] good.dups <- data_select(good.dups, og.names) out <- data[!duplicated(data$temporary_id2), , drop = FALSE] if (keep != "first") { match.index <- out$temporary_id2 %in% good.dups$temporary_id2 out[match.index, ] <- good.dups } # id is not useful anymore out <- data_remove(out, "temporary_id2") if (verbose) { dup.msg <- sprintf( "(%s duplicates removed, with method '%s')", dups.n, keep ) dup.msg <- paste0(dup.msg, ifelse(dups.n != 69, "", " 69... nice")) insight::format_alert(dup.msg) } out } #' @export data_unique.grouped_df <- function( data, select = NULL, keep = "best", exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) { select <- .select_nse( select, data, exclude = exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) grps <- attr(data, "groups", exact = TRUE) grps <- grps[[".rows"]] data2 <- as.data.frame(data_ungroup(data)) out <- lapply(grps, function(x) { data_unique.data.frame( data2[x, ], select = select, keep = keep, verbose = verbose ) }) out <- do.call(rbind, out) if (!insight::object_has_rownames(data)) { rownames(out) <- NULL } class(out) <- class(data) attr(out, "groups") <- attr(data, "groups") out } ================================================ FILE: R/data_unite.R ================================================ #' @title Unite ("merge") multiple variables #' @name data_unite #' #' @description #' Merge values of multiple variables per observation into one new variable. #' #' @param data A data frame. #' @param new_column The name of the new column, as a string. #' @param separator A character to use between values. #' @param append Logical, if `FALSE` (default), removes original columns that #' were united. If `TRUE`, all columns are preserved and the new column is #' appended to the data frame. #' @param remove_na Logical, if `TRUE`, missing values (`NA`) are not included #' in the united values. If `FALSE`, missing values are represented as `"NA"` #' in the united values. #' @param ... Currently not used. #' @inheritParams extract_column_names #' #' @seealso [`data_separate()`] #' #' @return `data`, with a newly created variable. #' #' @examples #' d <- data.frame( #' x = 1:3, #' y = letters[1:3], #' z = 6:8 #' ) #' d #' data_unite(d, new_column = "xyz") #' data_unite(d, new_column = "xyz", remove = FALSE) #' data_unite(d, new_column = "xyz", select = c("x", "z")) #' data_unite(d, new_column = "xyz", select = c("x", "z"), append = TRUE) #' @export data_unite <- function( data, new_column = NULL, select = NULL, exclude = NULL, separator = "_", append = FALSE, remove_na = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) { # we need a name for the new column if (is.null(new_column)) { insight::format_error( "No name for the new column was provided.", "Please use `new_column` to define a name for the newly created column." ) } # only one column name if (length(new_column) > 1) { insight::format_error( "Please provide only a single string for `new_column`, no character vector with multiple values." ) } # evaluate select/exclude, may be select-helpers select <- .select_nse( select, data, exclude, ignore_case, regex = regex, verbose = verbose ) if (is.null(select) || length(select) <= 1) { insight::format_error( "At least two columns in `select` are required for `data_unite()`." ) } # unite out <- data.frame( new_col = do.call(paste, c(data[select], sep = separator)), stringsAsFactors = FALSE ) colnames(out) <- new_column # remove missings if (remove_na) { # remove trailing and leading "NA_" and "_NA" out[[new_column]] <- gsub(paste0("^NA", separator), "", out[[new_column]]) out[[new_column]] <- gsub(paste0(separator, "NA$"), "", out[[new_column]]) # remove _NA_ inside string, add separator back. This ensure we match # whole-word NA and do not break strings like "COUNTRY_NATION" out[[new_column]] <- gsub( paste0(separator, "NA", separator), separator, out[[new_column]], fixed = TRUE ) } # remove old columns if (!isTRUE(append)) { data[select] <- NULL } # overwrite? if (new_column %in% colnames(data) && verbose) { insight::format_alert( "The name for `new_column` already exists as variable name in the data.", "This variable will be replaced by `new_column`." ) } # overwrite or append data[[new_column]] <- out[[new_column]] # fin data } ================================================ FILE: R/data_write.R ================================================ #' @param data The data frame that should be written to a file. #' @param delimiter For CSV-files, specifies the delimiter. Defaults to `","`, #' but in particular in European regions, `";"` might be a useful alternative, #' especially when exported CSV-files should be opened in Excel. #' @param save_labels Only applies to CSV files. If `TRUE`, value and variable #' labels (if any) will be saved as additional CSV file. This file has the same #' file name as the exported CSV file, but includes a `"_labels"` suffix (i.e. #' when the file name is `"mydat.csv"`, the additional file with value and #' variable labels is named `"mydat_labels.csv"`). #' @rdname data_read #' @export data_write <- function( data, path, delimiter = ",", convert_factors = FALSE, save_labels = FALSE, password = NULL, verbose = TRUE, ... ) { # check file type, so we know the target dta format file_type <- .file_ext(path) type <- switch( file_type, txt = , csv = "csv", sav = , por = "spss", zsav = "zspss", dta = "stata", xpt = "sas", rds = "rds", rda = , rdata = "rda", parquet = "parquet", "unknown" ) # no file type provided? if (!is.character(file_type) || file_type == "") { insight::format_error( "Could not detect file type. The `path` argument has no file extension.", "Please provide a file path including extension, like \"myfile.csv\" or \"c:/Users/Default/myfile.sav\"." ) } if (type %in% c("csv", "unknown")) { .write_csv_or_unknown( data, path, type, delimiter, convert_factors, save_labels, password, verbose, ... ) } else if (type == "rds") { .write_rds(data, path, password, verbose, ...) } else if (type == "rda") { .write_rda(data, path, password, verbose, ...) } else if (type == "parquet") { .write_parquet(data, path, password, verbose, ...) } else { .write_haven(data, path, password, verbose, type, ...) } } # base R formats ----- .write_rds <- function(data, path, password, verbose = TRUE, ...) { # encrypt data data <- .data_encryption(data, password) saveRDS(data, path, ...) } .write_rda <- function(data, path, password, verbose = TRUE, ...) { # encrypt data data <- .data_encryption(data, password) # save single data frame if (is.data.frame(data) || is.raw(data)) { save(data, file = path, ...) } else { # save list of data frames env <- as.environment(data) save(list = names(data), file = path, envir = env, ...) } } # nanoparquet ----- .write_parquet <- function(data, path, password, verbose = TRUE, ...) { # saving raw columns in data frames is not yet supported by parquet, thus, # we cannot save encrypted data right now. if (!is.null(password)) { insight::format_error( "Data encryption is not supported for parquet-files." ) } # requires nanoparquet package insight::check_if_installed("nanoparquet") # save single data frame nanoparquet::write_parquet(x = data, file = path, ...) } # saving into CSV or unknown format ----- .write_csv_or_unknown <- function( data, path, type = "csv", delimiter = ",", convert_factors = FALSE, save_labels = FALSE, password = NULL, verbose = TRUE, ... ) { # data encryption requires a data frame *with attributes* to be saved, which # is not possible for text or raw formats - thus, no encryption here if (!is.null(password)) { insight::format_error( "Data encryption is not supported for CSV or text files." ) } # save labels if (save_labels && type == "csv") { data <- .save_labels_to_file(data, path, delimiter, verbose) } # this might make sense when writing labelled data to CSV if (convert_factors) { data <- .pre_process_exported_data(data, convert_factors) } if (type == "csv") { insight::check_if_installed("readr") if (delimiter == ",") { readr::write_csv(x = data, file = path, ...) } else { readr::write_csv2(x = data, file = path, ...) } } else { insight::check_if_installed("rio") rio::export(x = data, file = path, ...) } } # saving into haven format ----- .write_haven <- function( data, path, password, verbose = TRUE, type = "spss", ... ) { # saving raw columns in data frames in not yet supported by haven, thus, # we cannot save encrypted data right now. if (!is.null(password)) { insight::format_error( "Data encryption is not supported for SPSS, SAS or Stata files." ) } insight::check_if_installed("haven") # check if user provided "compress" argument for SPSS files, # else, default to compression dots <- list(...) if (!is.null(dots$compress)) { compress <- dots$compress } else if (identical(type, "zspss")) { compress <- "zsav" } else { compress <- "byte" } # labelled data needs "labelled" class attributes data <- .set_haven_class_attributes(data, verbose) # fix invalid column names data <- .fix_column_names(data, verbose) if (type %in% c("spss", "zspss")) { # write to SPSS haven::write_sav(data = data, path = path, compress = compress) } else if (type == "stata") { # write to Stata haven::write_dta(data = data, path = path, ...) } else { # write to SAS haven::write_xpt(data = data, path = path, ...) } } # helper ------------------------------- # make sure we have the "labelled" class for labelled data .set_haven_class_attributes <- function(x, verbose = TRUE) { insight::check_if_installed("haven") if (verbose) { insight::format_alert("Preparing data file: converting variable types.") } x[] <- lapply(x, function(i) { # make sure we have labelled class for labelled data value_labels <- attr(i, "labels", exact = TRUE) variable_label <- attr(i, "label", exact = TRUE) # factor requires special preparation to save levels as labels # haven:::vec_cast_named requires "x" and "labels" to be of same type if (is.factor(i)) { haven::labelled( x = as.numeric(i), labels = stats::setNames(seq_along(levels(i)), levels(i)), label = variable_label ) } else if (!is.null(value_labels) || !is.null(variable_label)) { # character requires special preparation to save value labels # haven:::vec_cast_named requires "x" and "labels" to be of same type if (is.character(i)) { # only prepare value labels when these are not NULL if (!is.null(value_labels)) { value_labels <- stats::setNames( as.character(value_labels), names(value_labels) ) } haven::labelled( x = i, labels = value_labels, label = variable_label ) } else { # this should work for the remaining types... haven::labelled(x = i, labels = value_labels, label = variable_label) } } else { # non labelled data can be saved "as is" i } }) x } # packages like SPSS cannot deal with variable which names end with a dot # fix column names here by added a "fix" suffix .fix_column_names <- function(x, verbose = TRUE) { # check for correct column names dot_ends <- vapply(colnames(x), endsWith, FUN.VALUE = TRUE, suffix = ".") if (any(dot_ends)) { if (verbose) { insight::format_alert( "Found and fixed invalid column names so they can be read by other software packages." ) } colnames(x)[dot_ends] <- paste0(colnames(x)[dot_ends], "fix") } x } # save value and variable labels as addtional file .save_labels_to_file <- function(x, path, delimiter, verbose = TRUE) { insight::check_if_installed("readr") # get file path information fpath <- dirname(path) fname <- sub("\\.csv$", "", basename(path)) path <- paste0(fpath, .Platform$file.sep, fname, "_labels.csv") if (verbose) { insight::format_alert( paste0("Saving variable and value labels to \"", path, "\".") ) } # extract labels var_labs <- vapply( x, function(i) { l <- attr(i, "label", exact = TRUE) if (is.null(l)) { l <- "" } l }, character(1) ) # extract value labels value_labs <- vapply( x, function(i) { l <- attr(i, "labels", exact = TRUE) if (is.null(l)) { "" } else { paste0(l, "=", names(l), collapse = "; ") } }, character(1) ) out <- data.frame( variable = colnames(x), label = var_labs, labels = value_labs, stringsAsFactors = FALSE ) if (delimiter == ",") { readr::write_csv(x = out, file = path) } else { readr::write_csv2(x = out, file = path) } } # process data for export, use factor levels as data values ------------------- .pre_process_exported_data <- function(x, convert_factors) { # user may decide whether we automatically detect variable type or not if (isTRUE(convert_factors)) { x[] <- lapply(x, function(i) { # only proceed if not all missing if (!all(is.na(i))) { # save labels value_labels <- attr(i, "labels", exact = TRUE) variable_labels <- attr(i, "label", exact = TRUE) # filter, so only matching value labels remain value_labels <- value_labels[value_labels %in% unique(i)] # guess variable type if (is.character(i)) { # we need this to drop haven-specific class attributes i <- as.character(i) } else if ( !is.null(value_labels) && length(value_labels) == insight::n_unique(i) ) { # if all values are labelled, we assume factor. Use labels as levels if (is.numeric(i)) { i <- factor(i, labels = names(value_labels)) } else { i <- factor(as.character(i), labels = names(value_labels)) } i <- as.character(i) } else { # else, fall back to numeric i <- as.numeric(as.character(i)) } # add back variable label attr(i, "label") <- variable_labels } i }) } else { # drop haven class attributes x[] <- lapply(x, function(i) { # save labels class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr")) i }) } class(x) <- "data.frame" x } # data encryption --------------------------------- .data_encryption <- function(data, password = NULL) { # check if data should be encrypted if (!is.null(password)) { .validate_password(password) data <- .encrypt_data(data, password) # tell user to remember the password insight::format_warning( "Remember your `password`, else you will not be able to decrypt the data again!" ) } data } .encrypt_data <- function(data, password = NULL) { insight::check_if_installed("openssl", "for data encryption") x <- serialize(data, NULL) # it is important to remember the phrase! else, you cannot decrypt the data passphrase <- charToRaw(password) key <- openssl::sha256(passphrase) # encrypt the data. we return the raw data here, which can be handled by # rds/rda/rdata, and users can then also decrypt using openssl directly # datawizard is not necessarily needed for decryption openssl::aes_gcm_encrypt(x, key = key) } .validate_password <- function(password) { # password needs to be a character string if (!is.character(password) || length(password) != 1L || !nzchar(password)) { insight::format_error( "The password must be a single non-empty character string." ) } } ================================================ FILE: R/data_xtabulate.R ================================================ # helper to compute crosstables -------------- .crosstable <- function( x, by, weights = NULL, remove_na = FALSE, proportions = NULL, obj_name = NULL, group_variable = NULL ) { if (!is.null(proportions)) { proportions <- match.arg(proportions, c("row", "column", "full")) } # frequency table if (is.null(weights)) { # we have a `.default` and a `.data.frame` method for `data_tabulate()`. # since this is the default, `x` can be an object which cannot be used # with `table()`, that's why we add `tryCatch()` here. Below we give an # informative error message for non-supported objects. if (remove_na) { x_table <- tryCatch(table(x, by), error = function(e) NULL) } else { x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) } } else if (remove_na) { # weighted frequency table, excluding NA x_table <- tryCatch( stats::xtabs( weights ~ x + by, data = data.frame(weights = weights, x = x, by = by), na.action = stats::na.omit, addNA = FALSE ), error = function(e) NULL ) } else { # weighted frequency table, including NA x_table <- tryCatch( stats::xtabs( weights ~ x + by, data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), na.action = stats::na.pass, addNA = TRUE ), error = function(e) NULL ) } if (is.null(x_table)) { insight::format_warning(paste0( "Can't compute cross tables for objects of class `", class(x)[1], "`." )) return(NULL) } out <- as.data.frame(stats::ftable(x_table)) colnames(out) <- c("Value", "by", "N") total_n <- sum(out$N, na.rm = TRUE) # we want to round N for weighted frequencies if (!is.null(weights)) { out$N <- round(out$N) total_n <- round(total_n) } out <- data_to_wide(out, values_from = "N", names_from = "by") # use variable name as column name if (!is.null(obj_name)) { colnames(out)[1] <- obj_name } # for grouped data frames, add info about grouping variables if (!is.null(group_variable)) { var_info <- toString(lapply(colnames(group_variable), function(i) { sprintf("%s (%s)", i, group_variable[[i]]) })) out <- cbind( out[1], data.frame(Group = var_info, stringsAsFactors = FALSE), out[-1] ) } attr(out, "total_n") <- total_n attr(out, "weights") <- weights attr(out, "proportions") <- proportions attr(out, "varname") <- obj_name attr(out, "grouped_df") <- !is.null(group_variable) attr(out, "prop_table") <- .prop_table(out) class(out) <- c("datawizard_crosstab", "data.frame") out } # Helper function to calculate a table of proportions from a frequency table .prop_table <- function(x) { # Extract the "proportions" attribute, which determines the type of calculation (row, column, or full) props <- attributes(x)$proportions out <- NULL # Proceed only if the "proportions" attribute is set if (!is.null(props)) { # Identify numeric columns, as proportions are only calculated for these numeric_columns <- vapply(x, is.numeric, logical(1)) # Get the total count from the attributes, used for "full" proportions total_n <- attributes(x)$total_n # Use a switch to perform the calculation based on the "props" value out <- switch( props, # Calculate row-wise proportions row = lapply(seq_len(nrow(x)), function(i) { # Sum of the current row's numeric values row_sum <- sum(x[i, numeric_columns], na.rm = TRUE) # Avoid division by zero; if row sum is 0, return a row of zeros if (row_sum == 0) { tmp <- as.data.frame(as.list(rep(0, sum(numeric_columns)))) # for later rbind, we need identical column names colnames(tmp) <- colnames(x)[numeric_columns] tmp } else { x[i, numeric_columns] / row_sum } }), # Calculate column-wise proportions column = lapply(seq_len(ncol(x))[numeric_columns], function(i) { # Sum of the current column's values col_sum <- sum(x[, i], na.rm = TRUE) # Avoid division by zero; if column sum is 0, return a vector of zeros if (col_sum == 0) { rep(0, nrow(x)) } else { x[, i] / col_sum } }), # Calculate proportions relative to the total count of the entire table full = lapply(seq_len(ncol(x))[numeric_columns], function(i) { # Avoid division by zero; if total is 0, return a vector of zeros if (total_n == 0) { rep(0, nrow(x)) } else { x[, i] / total_n } }) ) } # If a proportion table was calculated, format it into a data frame if (!is.null(out)) { # The output of the switch is a list. We need to bind it into a data frame. # For row proportions, we bind rows. For column/full, we bind columns. out <- switch( props, row = as.data.frame(do.call(rbind, out)), as.data.frame(do.call(cbind, out)) ) # Set the column names of the new proportion table colnames(out) <- colnames(x)[numeric_columns] # Check if the dimensions are consistent before setting row names if (nrow(out) == nrow(x)) { # If the first column of the original data is not numeric, it's likely a # label column. Use these labels as row names in the output for better # readability. This is useful for identifying rows, especially when NAs # are present. if (isFALSE(numeric_columns[1])) { r_names <- x[[1]] r_names <- as.character(r_names) # Replace NA in labels with the string "NA", else we cannot set rownames r_names[is.na(r_names)] <- "NA" rownames(out) <- r_names } else { # Otherwise, just use the original row names rownames(out) <- rownames(x) } } } out } # methods --------------------- #' @export format.datawizard_crosstab <- function( x, format = "text", digits = 1, big_mark = NULL, include_total_row = TRUE, ... ) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) # find numeric columns, only for these we need row/column sums numeric_columns <- which(vapply(x, is.numeric, logical(1))) # compute total N for rows and columns total_n <- attributes(x)$total_n total_column <- rowSums(x[numeric_columns], na.rm = TRUE) total_row <- c(colSums(x[numeric_columns], na.rm = TRUE), total_n) # proportions? props <- attributes(x)$proportions prop_table <- attributes(x)$prop_table if (!is.null(props) && !is.null(prop_table)) { for (i in seq_len(ncol(prop_table))) { x[, numeric_columns[i]] <- paste( format(x[, numeric_columns[i]]), format( sprintf("(%.*f%%)", digits, 100 * prop_table[, i]), justify = "right" ) ) } } x[] <- lapply(x, as.character) # format data frame ftab <- insight::format_table(x, ...) # replace empty cells with NA ftab[] <- lapply(ftab, function(i) { i[i == ""] <- ifelse(identical(format, "text"), "", "(NA)") # nolint i }) # Remove ".00" from numbers ftab$Total <- gsub("\\.00$", "", as.character(total_column)) # add final total row to each sub-table. For multiple, collapsed table # (i.e. when length of `by` > 1), we don't want multiple total rows in the # table, so we would set include_total_row = FALSE for objects of class # `datawizard_crosstabs` (note plural s!) if (include_total_row) { # for text format, insert "empty row" before last total row if (identical(format, "text") || identical(format, "markdown")) { empty_row <- as.data.frame(t(data.frame( rep("", ncol(ftab)), c("Total", as.character(total_row)), stringsAsFactors = FALSE ))) } else { empty_row <- as.data.frame(t(data.frame( c("Total", as.character(total_row)), stringsAsFactors = FALSE ))) } colnames(empty_row) <- colnames(ftab) ftab <- rbind(ftab, empty_row) ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ]) } # insert big marks? ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark) ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark) # also format NA column name colnames(ftab)[colnames(ftab) == "NA"] <- ifelse( identical(format, "text"), "", "(NA)" ) ftab } # print, datawizard_crosstab --------------------- #' @export print.datawizard_crosstab <- function(x, big_mark = NULL, ...) { .print_text_table(x, big_mark, format = "text", ...) invisible(x) } #' @export print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) { .print_text_table(x, big_mark, format = "markdown", ...) } #' @export print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) { .print_text_table(x, big_mark, format = "html", ...) } .print_text_table <- function(x, big_mark = NULL, format = "text", ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL } else { caption <- paste0("Grouped by ", x[["Group"]][1]) x$Group <- NULL } # this function is used by all four supported format, markdown, text, html # and tt (tinytable). For tt, we sometimes have format "html" and backend = "tt", # so we need to check for this special case backend <- switch(format, html = , tt = .check_format_backend(...), format) # prepare table arguments fun_args <- list( format(x, big_mark = big_mark, format = format, ...), caption = caption, format = backend ) if (!format %in% c("html", "tt")) { fun_args$cross <- "+" fun_args$empty_line <- "-" } if (format == "text") { fun_args$missing <- "" } else { fun_args$missing <- "(NA)" } out <- do.call(insight::export_table, c(fun_args, list(...))) # print table if (identical(format, "text")) { cat(out) } else { out } } # print, datawizard_crosstabs --------------------- #' @export print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { .print_text_tables(x, big_mark, format = "text", ...) invisible(x) } #' @export print_md.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { .print_text_tables(x, big_mark, format = "markdown", ...) } #' @export print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { .print_text_tables(x, big_mark, format = "html", ...) } .print_text_tables <- function(x, big_mark = NULL, format = "text", ...) { if (length(x) == 1) { .print_text_table(x[[1]], big_mark = big_mark, format = format, ...) } else { x <- lapply(x, function(i) { # grouped data? if yes, add information on grouping factor if (!is.null(i[["Group"]])) { i$groups <- paste0("Grouped by ", i[["Group"]][1]) i$Group <- NULL } # if we don't have the gt-grouping variable "groups" yet, we use it now # for grouping. Else, we use a new column named "Variable", to avoid # overwriting the groups-variable from grouped data frames if (is.null(i$groups) && identical(format, "html")) { grp_variable <- "groups" } else { grp_variable <- "Variable" } # first variable differs for each data frame, so we harmonize it here i[[grp_variable]] <- colnames(i)[1] colnames(i)[1] <- "Value" # move column to first position i <- data_relocate(i, select = grp_variable, before = 1) # format data frame format( i, format = format, big_mark = big_mark, include_total_row = FALSE, ... ) }) # now bind, but we need to check for equal number of columns if (all(lengths(x) == max(length(x)))) { out <- do.call(rbind, x) } else { # if not all tables have identical columns, we can use "data_merge()", # which safely row-binds all data frames. However, the column order can be # messed up, so we save column order here and restore it later col_order <- colnames(x[[which.max(lengths(x))]]) out <- data_merge(x, join = "bind")[col_order] } # split tables for grouped data frames if (!is.null(out$groups)) { out <- split(out, out$groups) out <- lapply(out, function(subtable) { # for text and markdown, if we split tables, we remove the "groups" # variable. we need to keep it for HTML tables. if (!identical(format, "html")) { attr(subtable, "table_caption") <- c(unique(subtable$groups), "blue") subtable$groups <- NULL } # remove duplicated names for (grpvars in c("Variable", "Group")) { if (!is.null(subtable[[grpvars]])) { subtable[[grpvars]][duplicated(subtable[[grpvars]])] <- "" } } subtable }) # no splitting of grouped data frames into list for HTML format, # because splitting is done by the `by` argument later if (identical(format, "html")) { out <- do.call(rbind, out) } } # prepare table arguments fun_args <- list( out, format = format ) if (format != "html") { fun_args$cross <- "+" fun_args$empty_line <- "-" } else { fun_args$by <- "groups" } if (format == "text") { fun_args$missing <- "" } else { fun_args$missing <- "(NA)" } out <- do.call(insight::export_table, c(fun_args, list(...))) # print table if (identical(format, "text")) { cat(out) } else { out } } } # helper --------------------- .validate_by <- function(by, x) { if (!is.null(by)) { if (is.character(by)) { # If "by" is a character string, must be of length 1 if (length(by) > 1) { insight::format_error( "If `by` is a string indicating a variable name, `by` must be of length 1.", "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`." ) } # if "by" is a character, "x" must be a data frame if (!is.data.frame(x)) { insight::format_error( "If `by` is a string indicating a variable name, `x` must be a data frame." ) } # is "by" a column in "x"? if (!by %in% colnames(x)) { insight::format_error(sprintf( "The variable specified in `by` was not found in `x`. %s", .misspelled_string(names(x), by, "Possibly misspelled?") )) } by <- x[[by]] } # is "by" of same length as "x"? if (is.data.frame(x) && length(by) != nrow(x)) { insight::format_error( "Length of `by` must be equal to number of rows in `x`." ) # nolint } if (!is.data.frame(x) && length(by) != length(x)) { insight::format_error("Length of `by` must be equal to length of `x`.") # nolint } if (!is.factor(by)) { # coerce "by" to factor, including labels by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE) } } by } .validate_table_weights <- function(weights, x, weights_expression = NULL) { # exception: for vectors, if weighting variable not found, "weights" is NULL. # to check this, we further need to check whether a weights expression was # provided, e.g. "weights = iris$not_found" - all this is only relevant when # weights is NULL if (is.null(weights)) { # possibly misspelled weights-variables for default-method ---------------- # ------------------------------------------------------------------------- # do we have any value for weights_expression? if ( !is.null(weights_expression) && # due to deparse() and substitute, NULL becomes "NULL" - we need to check for this !identical(weights_expression, "NULL") && # we should only run into this problem, when a variable from a data frame # is used in the data_tabulate() method for vectors - thus, we need to check # whether the weights_expression contains a "$" - `iris$not_found` is "NULL" # we need this check, because the default-method of data_tabulate() is called # from the data.frame method, where `weights = weights`, and then, # deparse(substitute(weights)) is "weights" (not "NULL" or "iris$not_found"), # leading to an error when actually all is OK (if "weights" is NULL) # Example: #> efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) # Here, efc$wweight is NULL #> data_tabulate(efc$c172code, weights = efc$wweight) # Here, wweight errors anyway, because object "wweight" is not found #> data_tabulate(efc$c172code, weights = wweight) grepl("$", weights_expression, fixed = TRUE) ) { insight::format_error( "The variable specified in `weights` was not found. Possibly misspelled?" ) } } else { # possibly misspecified weights-variables for data.frame-method ----------- # ------------------------------------------------------------------------- if (is.character(weights)) { # If "weights" is a character string, must be of length 1 if (length(weights) > 1) { insight::format_error( "If `weights` is a string indicating a variable name, `weights` must be of length 1." ) } # if "weights" is a character, "x" must be a data frame if (!is.data.frame(x)) { insight::format_error( "If `weights` is a string indicating a variable name, `x` must be a data frame." ) # nolint } # is "by" a column in "x"? if (!weights %in% colnames(x)) { insight::format_error(sprintf( "The variable specified in `weights` was not found in `x`. %s", .misspelled_string(names(x), weights, "Possibly misspelled?") )) } weights <- x[[weights]] } # is "by" of same length as "x"? if (is.data.frame(x) && length(weights) != nrow(x)) { insight::format_error( "Length of `weights` must be equal to number of rows in `x`." ) # nolint } if (!is.data.frame(x) && length(weights) != length(x)) { insight::format_error( "Length of `weights` must be equal to length of `x`." ) # nolint } } weights } ================================================ FILE: R/datawizard-package.R ================================================ #' `datawizard` #' #' @title datawizard: Easy Data Wrangling and Statistical Transformations #' #' @description #' #' A lightweight package to assist in key steps involved in any data analysis #' workflow: #' #' - wrangling the raw data to get it in the needed form, #' - applying preprocessing steps and statistical transformations, and #' - compute statistical summaries of data properties and distributions. #' #' It is also the data wrangling backend for packages in 'easystats' ecosystem. #' Reference: Patil et al. (2022) \doi{10.21105/joss.04684}. #' #' @docType package #' @aliases datawizard datawizard-package #' @name datawizard-package #' @keywords internal "_PACKAGE" ================================================ FILE: R/demean.R ================================================ #' Compute group-meaned and de-meaned variables #' #' @description #' #' `demean()` computes group- and de-meaned versions of a variable that can be #' used in regression analysis to model the between- and within-subject effect #' (person-mean centering or centering within clusters). `degroup()` is more #' generic in terms of the centering-operation. While `demean()` always uses #' mean-centering, `degroup()` can also use the mode or median for centering. #' #' @param x A data frame. #' @param select Character vector (or formula) with names of variables to select #' that should be group- and de-meaned. #' @param by Character vector (or formula) with the name of the variable that #' indicates the group- or cluster-ID. For cross-classified or nested designs, #' `by` can also identify two or more variables as group- or cluster-IDs. If #' the data is nested and should be treated as such, set `nested = TRUE`. Else, #' if `by` defines two or more variables and `nested = FALSE`, a cross-classified #' design is assumed. Note that `demean()` and `degroup()` can't handle a mix #' of nested and cross-classified designs in one model. #' #' For nested designs, `by` can be: #' - a character vector with the name of the variable that indicates the #' levels, ordered from *highest* level to *lowest* (e.g. #' `by = c("L4", "L3", "L2")`. #' - a character vector with variable names in the format `by = "L4/L3/L2"`, #' where the levels are separated by `/`. #' #' See also section _De-meaning for cross-classified designs_ and #' _De-meaning for nested designs_ below. #' @param nested Logical, if `TRUE`, the data is treated as nested. If `FALSE`, #' the data is treated as cross-classified. Only applies if `by` contains more #' than one variable. #' @param center Method for centering. `demean()` always performs #' mean-centering, while `degroup()` can use `center = "median"` or #' `center = "mode"` for median- or mode-centering, and also `"min"` #' or `"max"`. #' @param suffix_demean,suffix_groupmean String value, will be appended to the #' names of the group-meaned and de-meaned variables of `x`. By default, #' de-meaned variables will be suffixed with `"_within"` and #' grouped-meaned variables with `"_between"`. #' @param append Logical, if `TRUE` (default), the group- and de-meaned #' variables will be appended (column bind) to the original data `x`, #' thus returning both the original and the de-/group-meaned variables. #' @param add_attributes Logical, if `TRUE`, the returned variables gain #' attributes to indicate the within- and between-effects. This is only #' relevant when printing `model_parameters()` - in such cases, the #' within- and between-effects are printed in separated blocks. #' @inheritParams center #' #' @return #' A data frame with the group-/de-meaned variables, which get the suffix #' `"_between"` (for the group-meaned variable) and `"_within"` (for the #' de-meaned variable) by default. For cross-classified or nested designs, #' the name pattern of the group-meaned variables is the name of the centered #' variable followed by the name of the variable that indicates the related #' grouping level, e.g. `predictor_L3_between` and `predictor_L2_between`. #' #' @seealso If grand-mean centering (instead of centering within-clusters) #' is required, see [`center()`]. See [`performance::check_group_variation()`] #' to check for heterogeneity bias. #' #' @section Heterogeneity Bias: #' #' Mixed models include different levels of sources of variability, i.e. #' error terms at each level. When macro-indicators (or level-2 predictors, #' or higher-level units, or more general: *group-level predictors that #' **vary** within and across groups*) are included as fixed effects (i.e. #' treated as covariate at level-1), the variance that is left unaccounted for #' this covariate will be absorbed into the error terms of level-1 and level-2 #' (_Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6._): #' "Such covariates contain two parts: one that is specific to the higher-level #' entity that does not vary between occasions, and one that represents the #' difference between occasions, within higher-level entities" (_Bell et al. 2015_). #' Hence, the error terms will be correlated with the covariate, which violates #' one of the assumptions of mixed models (iid, independent and identically #' distributed error terms). This bias is also called the *heterogeneity bias* #' (_Bell et al. 2015_). To resolve this problem, level-2 predictors used as #' (level-1) covariates should be separated into their "within" and "between" #' effects by "de-meaning" and "group-meaning": After demeaning time-varying #' predictors, "at the higher level, the mean term is no longer constrained by #' Level 1 effects, so it is free to account for all the higher-level variance #' associated with that variable" (_Bell et al. 2015_). #' #' @section Panel data and correlating fixed and group effects: #' #' `demean()` is intended to create group- and de-meaned variables for panel #' regression models (fixed effects models), or for complex #' random-effect-within-between models (see _Bell et al. 2015, 2018_), where #' group-effects (random effects) and fixed effects correlate (see #' _Bafumi and Gelman 2006_). This can happen, for instance, when analyzing #' panel data, which can lead to *Heterogeneity Bias*. To control for correlating #' predictors and group effects, it is recommended to include the group-meaned #' and de-meaned version of *time-varying covariates* (and group-meaned version #' of *time-invariant covariates* that are on a higher level, e.g. level-2 #' predictors) in the model. By this, one can fit complex multilevel models for #' panel data, including time-varying predictors, time-invariant predictors and #' random effects. #' #' @section Why mixed models are preferred over fixed effects models: #' #' A mixed models approach can model the causes of endogeneity explicitly #' by including the (separated) within- and between-effects of time-varying #' fixed effects and including time-constant fixed effects. Furthermore, #' mixed models also include random effects, thus a mixed models approach #' is superior to classic fixed-effects models, which lack information of #' variation in the group-effects or between-subject effects. Furthermore, #' fixed effects regression cannot include random slopes, which means that #' fixed effects regressions are neglecting "cross-cluster differences in the #' effects of lower-level controls (which) reduces the precision of estimated #' context effects, resulting in unnecessarily wide confidence intervals and #' low statistical power" (_Heisig et al. 2017_). #' #' @section Terminology: #' #' The group-meaned variable is simply the mean of an independent variable #' within each group (or id-level or cluster) represented by `by`. It represents #' the cluster-mean of an independent variable. The regression coefficient of a #' group-meaned variable is the *between-subject-effect*. The de-meaned variable #' is then the centered version of the group-meaned variable. De-meaning is #' sometimes also called person-mean centering or centering within clusters. #' The regression coefficient of a de-meaned variable represents the #' *within-subject-effect*. #' #' @section De-meaning with continuous predictors: #' #' For continuous time-varying predictors, the recommendation is to include #' both their de-meaned and group-meaned versions as fixed effects, but not #' the raw (untransformed) time-varying predictors themselves. The de-meaned #' predictor should also be included as random effect (random slope). In #' regression models, the coefficient of the de-meaned predictors indicates #' the within-subject effect, while the coefficient of the group-meaned #' predictor indicates the between-subject effect. #' #' @section De-meaning with binary predictors: #' #' For binary time-varying predictors, there are two recommendations. First #' is to include the raw (untransformed) binary predictor as fixed effect #' only and the *de-meaned* variable as random effect (random slope). #' The alternative would be to add the de-meaned version(s) of binary #' time-varying covariates as additional fixed effect as well (instead of #' adding it as random slope). Centering time-varying binary variables to #' obtain within-effects (level 1) isn't necessary. They have a sensible #' interpretation when left in the typical 0/1 format (_Hoffmann 2015, #' chapter 8-2.I_). `demean()` will thus coerce categorical time-varying #' predictors to numeric to compute the de- and group-meaned versions for #' these variables, where the raw (untransformed) binary predictor and the #' de-meaned version should be added to the model. #' #' @section De-meaning of factors with more than 2 levels: #' #' Factors with more than two levels are demeaned in two ways: first, these #' are also converted to numeric and de-meaned; second, dummy variables #' are created (binary, with 0/1 coding for each level) and these binary #' dummy-variables are de-meaned in the same way (as described above). #' Packages like **panelr** internally convert factors to dummies before #' demeaning, so this behaviour can be mimicked here. #' #' @section De-meaning interaction terms: #' #' There are multiple ways to deal with interaction terms of within- and #' between-effects. #' #' - A classical approach is to simply use the product term of the de-meaned #' variables (i.e. introducing the de-meaned variables as interaction term #' in the model formula, e.g. `y ~ x_within * time_within`). This approach, #' however, might be subject to bias (see _Giesselmann & Schmidt-Catran 2020_). #' #' - Another option is to first calculate the product term and then apply the #' de-meaning to it. This approach produces an estimator "that reflects #' unit-level differences of interacted variables whose moderators vary #' within units", which is desirable if *no* within interaction of #' two time-dependent variables is required. This is what `demean()` does #' internally when `select` contains interaction terms. #' #' - A third option, when the interaction should result in a genuine within #' estimator, is to "double de-mean" the interaction terms #' (_Giesselmann & Schmidt-Catran 2018_), however, this is currently #' not supported by `demean()`. If this is required, the `wmb()` #' function from the **panelr** package should be used. #' #' To de-mean interaction terms for within-between models, simply specify #' the term as interaction for the `select`-argument, e.g. `select = "a*b"` #' (see 'Examples'). #' #' @section De-meaning for cross-classified designs: #' #' `demean()` can handle cross-classified designs, where the data has two or #' more groups at the higher (i.e. second) level. In such cases, the #' `by`-argument can identify two or more variables that represent the #' cross-classified group- or cluster-IDs. The de-meaned variables for #' cross-classified designs are simply subtracting all group means from each #' individual value, i.e. _fully cluster-mean-centering_ (see _Guo et al. 2024_ #' for details). Note that de-meaning for cross-classified designs is *not* #' equivalent to de-meaning of nested data structures from models with three or #' more levels. Set `nested = TRUE` to explicitly assume a nested design. For #' cross-classified designs, de-meaning is supposed to work for models like #' `y ~ x + (1|level3) + (1|level2)`, but *not* for models like #' `y ~ x + (1|level3/level2)`. Note that `demean()` and `degroup()` can't #' handle a mix of nested and cross-classified designs in one model. #' #' @section De-meaning for nested designs: #' #' _Brincks et al. (2017)_ have suggested an algorithm to center variables for #' nested designs, which is implemented in `demean()`. For nested designs, set #' `nested = TRUE` *and* specify the variables that indicate the different #' levels in descending order in the `by` argument. E.g., #' `by = c("level4", "level3, "level2")` assumes a model like #' `y ~ x + (1|level4/level3/level2)`. An alternative notation for the #' `by`-argument would be `by = "level4/level3/level2"`, similar to the #' formula notation. #' #' @section Analysing panel data with mixed models using lme4: #' #' A description of how to translate the formulas described in *Bell et al. 2018* #' into R using `lmer()` from **lme4** can be found in #' [this vignette](https://easystats.github.io/parameters/articles/demean.html). #' #' @references #' #' - Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors #' and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the #' American Political Science Association. #' #' - Bell A, Fairbrother M, Jones K. 2019. Fixed and Random Effects #' Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074 #' #' - Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects #' Modeling of Time-Series Cross-Sectional and Panel Data. Political Science #' Research and Methods, 3(1), 133–153. #' #' - Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J., #' Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in #' Three-Level Contextual Models. Multivariate Behavioral Research, 52(2), #' 149–163. https://doi.org/10.1080/00273171.2016.1256753 #' #' - Gelman A, Hill J. 2007. Data Analysis Using Regression and #' Multilevel/Hierarchical Models. Analytical Methods for Social Research. #' Cambridge, New York: Cambridge University Press #' #' - Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed #' effects regression models. Sociological Methods & Research, 1–28. #' https://doi.org/10.1177/0049124120914934 #' #' - Guo Y, Dhaliwal J, Rights JD. 2024. Disaggregating level-specific effects #' in cross-classified multilevel models. Behavior Research Methods, 56(4), #' 3023–3057. #' #' - Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity: #' Why Multilevel Models May Benefit from Accounting for Cross-Cluster #' Differences in the Effects of Controls. American Sociological Review 82 #' (4): 796–827. #' #' - Hoffman L. 2015. Longitudinal analysis: modeling within-person #' fluctuation and change. New York: Routledge #' #' @examples #' #' data(iris) #' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID #' iris$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable #' #' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID") #' head(x) #' #' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID") #' head(x) #' #' #' # demean interaction term x*y #' dat <- data.frame( #' a = c(1, 2, 3, 4, 1, 2, 3, 4), #' x = c(4, 3, 3, 4, 1, 2, 1, 2), #' y = c(1, 2, 1, 2, 4, 3, 2, 1), #' ID = c(1, 2, 3, 1, 2, 3, 1, 2) #' ) #' demean(dat, select = c("a", "x*y"), by = "ID") #' #' # or in formula-notation #' demean(dat, select = ~ a + x * y, by = ~ID) #' #' @export demean <- function( x, select, by, nested = FALSE, suffix_demean = "_within", suffix_groupmean = "_between", append = TRUE, add_attributes = TRUE, verbose = TRUE ) { degroup( x = x, select = select, by = by, nested = nested, center = "mean", suffix_demean = suffix_demean, suffix_groupmean = suffix_groupmean, append = append, add_attributes = add_attributes, verbose = verbose ) } #' @rdname demean #' @export degroup <- function( x, select, by, nested = FALSE, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", append = TRUE, add_attributes = TRUE, verbose = TRUE ) { # ugly tibbles again... but save original data frame original_data <- x x <- .coerce_to_dataframe(x) center <- match.arg( tolower(center), choices = c("mean", "median", "mode", "min", "max") ) if (inherits(select, "formula")) { # formula to character, remove "~", split at "+". We don't use `all.vars()` # here because we want to keep the interaction terms as they are select <- trimws(unlist( strsplit( gsub("~", "", insight::safe_deparse(select), fixed = TRUE), "+", fixed = TRUE ), use.names = FALSE )) } # handle different "by" options if (inherits(by, "formula")) { by <- all.vars(by) } # we also allow lme4-syntax here: if by = "L4/L3/L2", we assume a nested design if (length(by) == 1 && grepl("/", by, fixed = TRUE)) { by <- insight::trim_ws(unlist( strsplit(by, "/", fixed = TRUE), use.names = FALSE )) nested <- TRUE } # identify interaction terms interactions_no <- select[!grepl("(\\*|\\:)", select)] interactions_yes <- select[grepl("(\\*|\\:)", select)] # if we have interaction terms that should be de-meaned, calculate the product # of the terms first, then demean the product if (length(interactions_yes)) { interaction_terms <- lapply( strsplit(interactions_yes, "*", fixed = TRUE), trimws ) product <- lapply(interaction_terms, function(i) do.call(`*`, x[, i])) new_dat <- as.data.frame(stats::setNames( product, gsub("\\s", "", gsub("*", "_", interactions_yes, fixed = TRUE)) )) x <- cbind(x, new_dat) select <- c(interactions_no, colnames(new_dat)) } # check if all variables are present not_found <- setdiff(c(select, by), colnames(x)) if (length(not_found)) { insight::format_error( paste0( "Variable", ifelse(length(not_found) > 1, "s ", " "), text_concatenate(not_found, enclose = "\""), ifelse(length(not_found) > 1, " were", " was"), " not found in the dataset." ), .misspelled_string( colnames(x), not_found, "Possibly misspelled or not yet defined?" ) ) } # get data to demean... dat <- x[, c(select, by)] # find categorical predictors that are coded as factors categorical_predictors <- vapply( dat[select], is.factor, FUN.VALUE = logical(1L) ) # convert binary predictors to numeric if (any(categorical_predictors)) { # convert categorical to numeric, and then demean dat[select[categorical_predictors]] <- lapply( dat[select[categorical_predictors]], function(i) as.numeric(i) - 1 ) # convert categorical to dummy, and demean each binary dummy for (i in select[categorical_predictors]) { if (nlevels(x[[i]]) > 2) { for (j in levels(x[[i]])) { # create vector with zeros f <- rep(0, nrow(x)) # for each matching level, set dummy to 1 f[x[[i]] == j] <- 1 dummy <- data.frame(f) # colnames = variable name + factor level # also add new dummy variables to "select" colnames(dummy) <- sprintf("%s_%s", i, j) select <- c(select, sprintf("%s_%s", i, j)) # add to data dat <- cbind(dat, dummy) } } } # tell user... if (isTRUE(verbose)) { insight::format_alert( paste0( "Categorical predictors (", toString(names(categorical_predictors)[categorical_predictors]), ") have been coerced to numeric values to compute de- and group-meaned variables.\n" ) ) } } # group variables, then calculate the mean-value # for variables within each group (the group means). assign # mean values to a vector of same length as the data gm_fun <- switch( center, mode = function(.gm) distribution_mode(stats::na.omit(.gm)), median = function(.gm) stats::median(.gm, na.rm = TRUE), min = function(.gm) min(.gm, na.rm = TRUE), max = function(.gm) max(.gm, na.rm = TRUE), function(.gm) mean(.gm, na.rm = TRUE) ) # we allow disaggregating level-specific effects for cross-classified multilevel # models (see Guo et al. 2024). Two levels should work as proposed by the authors, # more levels also already work, but need to check the formula from the paper # and validate results if (length(by) == 1) { # simple case: one level group_means_list <- lapply(select, function(i) { stats::ave(dat[[i]], dat[[by]], FUN = gm_fun) }) names(group_means_list) <- select # create de-meaned variables by subtracting the group mean from each individual value person_means_list <- lapply(select, function(i) { dat[[i]] - group_means_list[[i]] }) } else if (nested) { # nested design: by > 1, nested is explicitly set to TRUE # We want: # L3_between = xbar(k) # L2_between = xbar(j,k) - xbar(k) # L1_within = x(ijk) - xbar(jk) # , where # x(ijk) is the individual value / variable that is measured on level 1 # xbar(k) <- ave(x_ijk, L3, FUN = mean), the group mean of the variable at highest level # xbar(jk) <- ave(x_ijk, L3, L2, FUN = mean), the group mean of the variable at second level group_means_list <- lapply(select, function(i) { out <- lapply(seq_along(by), function(k) { stats::ave(dat[[i]], dat[, by[1:k], drop = FALSE], FUN = gm_fun) }) # subtract mean of higher level from lower level for (j in 2:length(by)) { out[[j]] <- out[[j]] - out[[j - 1]] } names(out) <- paste0(i, "_", by) out }) # create de-meaned variables by subtracting the group mean from each individual value person_means_list <- lapply( # seq_along(select), # function(i) dat[[select[i]]] - group_means_list[[i]][[length(by)]] select, function(i) { dat[[i]] - stats::ave(dat[[i]], dat[, by, drop = FALSE], FUN = gm_fun) } ) } else { # cross-classified design: by > 1 group_means_list <- lapply(by, function(j) { out <- lapply(select, function(i) { stats::ave(dat[[i]], dat[[j]], FUN = gm_fun) }) names(out) <- paste0(select, "_", j) out }) group_means_list <- unlist(group_means_list, recursive = FALSE) # de-meaned variables for cross-classified design is simply subtracting # all group means from each individual value person_means_list <- lapply(select, function(i) { sum_group_means <- Reduce("+", group_means_list[paste0(i, "_", by)]) dat[[i]] - sum_group_means }) } # preserve names names(person_means_list) <- select # convert to data frame and add suffix to column names group_means <- as.data.frame(group_means_list) person_means <- as.data.frame(person_means_list) colnames(person_means) <- sprintf( "%s%s", colnames(person_means), suffix_demean ) colnames(group_means) <- sprintf( "%s%s", colnames(group_means), suffix_groupmean ) if (isTRUE(add_attributes)) { person_means[] <- lapply(person_means, function(i) { attr(i, "within-effect") <- TRUE i }) group_means[] <- lapply(group_means, function(i) { attr(i, "between-effect") <- TRUE i }) } # between and within effects out <- cbind(group_means, person_means) # append to original data? if (isTRUE(append)) { # check for unique column names duplicated_columns <- intersect(colnames(out), colnames(original_data)) if (length(duplicated_columns)) { insight::format_error(paste0( "One or more of the centered variables already exist in the orignal data frame: ", # nolint text_concatenate(duplicated_columns, enclose = "`"), ". Please rename the affected variable(s) in the original data, or use the arguments `suffix_demean` and `suffix_groupmean` to rename the centered variables." # nolint )) } out <- cbind(original_data, out) } out } #' @rdname demean #' @export detrend <- degroup ================================================ FILE: R/describe_distribution.R ================================================ #' Describe a distribution #' #' This function describes a distribution by a set of indices (e.g., measures of #' centrality, dispersion, range, skewness, (excess) kurtosis). #' #' @param x A numeric vector, a character vector, a data frame, or a list. See #' `Details`. #' @param by Column names indicating how to split the data in various groups #' before describing the distribution. `by` groups will be added to potentially #' existing groups created by `data_group()`. #' @param range Return the range (min and max). #' @param quartiles Return the first and third quartiles (25th and 75th #' percentiles). #' @param include_factors Logical, if `TRUE`, factors are included in the #' output, however, only columns for range (first and last factor levels) as #' well as n and missing will contain information. #' @param ci Confidence Interval (CI) level. Default is `NULL`, i.e. no #' confidence intervals are computed. If not `NULL`, confidence intervals are #' based on bootstrap replicates (see `iterations`). #' @param iterations The number of bootstrap replicates for computing confidence #' intervals. Only applies when `ci` is not `NULL`. Defaults to `100`. For #' more stable results, increase the number of `iterations`, but note that this #' can also increase the computation time significantly. #' @param iqr Logical, if `TRUE`, the interquartile range is calculated (based #' on [stats::IQR()], using `type = 6`). #' @param verbose Show or silence warnings and messages. #' @inheritParams bayestestR::point_estimate #' @inheritParams extract_column_names #' #' @details If `x` is a data frame, only numeric variables are kept and will be #' displayed in the summary by default. #' #' If `x` is a list, the behavior is different whether `x` is a stored list. If #' `x` is stored (for example, `describe_distribution(mylist)` where `mylist` #' was created before), artificial variable names are used in the summary #' (`Var_1`, `Var_2`, etc.). If `x` is an unstored list (for example, #' `describe_distribution(list(mtcars$mpg))`), then `"mtcars$mpg"` is used as #' variable name. #' #' @note There is also a #' [`plot()`-method](https://easystats.github.io/see/articles/parameters.html) #' implemented in the [**see**-package](https://easystats.github.io/see/). #' #' @seealso [kurtosis()] to compute kurtosis (recognized as excess kurtosis). #' #' @return A data frame with columns that describe the properties of the variables. #' #' @examplesIf require("bayestestR", quietly = TRUE) #' describe_distribution(rnorm(100)) #' #' data(iris) #' describe_distribution(iris) #' describe_distribution(iris, include_factors = TRUE, quartiles = TRUE) #' describe_distribution(list(mtcars$mpg, mtcars$cyl)) #' @export describe_distribution <- function(x, ...) { UseMethod("describe_distribution") } #' @export describe_distribution.default <- function(x, verbose = TRUE, ...) { if (verbose) { insight::format_warning( paste0("Can't describe variables of class `", class(x)[1], "`.") ) } NULL } #' @export describe_distribution.list <- function( x, centrality = "mean", dispersion = TRUE, iqr = TRUE, range = TRUE, quartiles = FALSE, ci = NULL, include_factors = FALSE, iterations = 100, threshold = 0.1, verbose = TRUE, ... ) { factor_el <- which(vapply(x, is.factor, FUN.VALUE = logical(1L))) num_el <- which(vapply(x, is.numeric, FUN.VALUE = logical(1L))) # get elements names as is # ex: `list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl")` nm <- vapply( sys.call()[[2]], insight::safe_deparse, FUN.VALUE = character(1L) )[-1] if (isTRUE(include_factors)) { x <- x[c(num_el, factor_el)] if (length(nm) != 0) { nm <- nm[c(num_el, factor_el)] } } else { x <- x[num_el] if (length(nm) != 0) { nm <- nm[num_el] } } # Not possible to obtain elements names if they are stored in # an object if (length(nm) == 0) { nm <- paste0("Var_", seq_along(x)) } # The function currently doesn't support descriptive summaries for character # or factor types. out <- do.call( rbind, lapply(x, function(i) { if ( (include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i)) ) { describe_distribution( i, centrality = centrality, dispersion = dispersion, iqr = iqr, range = range, quartiles = quartiles, ci = ci, iterations = iterations, threshold = threshold, verbose = verbose ) } }) ) if (is.null(names(x))) { new_names <- nm } else { empty_names <- which(!nzchar(names(x), keepNA = TRUE)) new_names <- names(x) new_names[empty_names] <- nm[empty_names] } out$Variable <- new_names row.names(out) <- NULL out <- out[c("Variable", setdiff(colnames(out), "Variable"))] class(out) <- unique(c( "parameters_distribution", "see_parameters_distribution", class(out) )) attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500) attr(out, "ci") <- ci attr(out, "centrality") <- centrality attr(out, "threshold") <- threshold out } #' @rdname describe_distribution #' @export describe_distribution.numeric <- function( x, centrality = "mean", dispersion = TRUE, iqr = TRUE, range = TRUE, quartiles = FALSE, ci = NULL, iterations = 100, threshold = 0.1, verbose = TRUE, ... ) { insight::check_if_installed("bayestestR") out <- data.frame(.temp = 0) # Missing n_missing <- sum(is.na(x)) x <- stats::na.omit(x) # Point estimates out <- cbind( out, bayestestR::point_estimate( x, centrality = centrality, dispersion = dispersion, threshold = threshold, verbose = verbose, ... ) ) # interquartile range, type same as minitab and SPSS if (iqr) { out$IQR <- stats::IQR(x, na.rm = TRUE, type = 6) } # Confidence Intervals if (!is.null(ci)) { insight::check_if_installed("boot") # tell user about bootstrapping and appropriate number of iterations. # "show_iterations_msg" is an undocumented argument that is only passed # internally to this function to avoid multiple repeated messages if (!isFALSE(list(...)$show_iterations_msg)) { .show_iterations_warning(verbose, iterations, ci) } # calculate CI for each centrality for (cntr in .centrality_options(centrality)) { results <- tryCatch( { boot::boot( data = x, statistic = .boot_distribution, R = iterations, centrality = cntr ) }, error = function(e) { msg <- conditionMessage(e) if (!is.null(msg) && msg == "sample is too sparse to find TD") { insight::format_warning( "When bootstrapping CIs, sample was too sparse to find TD. Returning NA for CIs." ) list(t = c(NA_real_, NA_real_)) } } ) out_ci <- bayestestR::ci(results$t, ci = ci, verbose = FALSE) ci_data <- data.frame(out_ci$CI_low[1], out_ci$CI_high[1]) colnames(ci_data) <- c(paste0("CI_low_", cntr), paste0("CI_high_", cntr)) out <- cbind(out, ci_data) } } # Range if (range) { out <- cbind( out, data.frame( Min = min(x, na.rm = TRUE), Max = max(x, na.rm = TRUE) ) ) } # Quartiles if (quartiles) { out <- cbind( out, data.frame( Q1 = stats::quantile(x, probs = 0.25, na.rm = TRUE), Q3 = stats::quantile(x, probs = 0.75, na.rm = TRUE) ) ) } # Skewness out <- cbind( out, data.frame( Skewness = as.numeric(skewness(x, verbose = verbose)), Kurtosis = as.numeric(kurtosis(x, verbose = verbose)) ) ) out$n <- length(x) out$n_Missing <- n_missing out$.temp <- NULL class(out) <- unique(c( "parameters_distribution", "see_parameters_distribution", class(out) )) attr(out, "data") <- x attr(out, "ci") <- ci attr(out, "centrality") <- centrality attr(out, "threshold") <- threshold out } #' @rdname describe_distribution #' @export describe_distribution.factor <- function( x, dispersion = TRUE, range = TRUE, verbose = TRUE, ... ) { # Missing n_missing <- sum(is.na(x)) x <- stats::na.omit(x) out <- data.frame( Mean = NA, SD = NA, CI_low = NA, CI_high = NA, IQR = NA, Min = levels(x)[1], Max = levels(x)[nlevels(x)], Q1 = NA, Q3 = NA, Skewness = as.numeric(skewness(x, verbose = verbose)), Kurtosis = as.numeric(kurtosis(x, verbose = verbose)), n = length(x), n_Missing = n_missing, stringsAsFactors = FALSE ) if (!dispersion) { out$SD <- NULL } dot_args <- list(...) if (is.null(dot_args[["ci"]])) { out$CI_low <- NULL out$CI_high <- NULL } if (is.null(dot_args[["iqr"]]) || isFALSE(dot_args[["iqr"]])) { out$IQR <- NULL } if (is.null(dot_args[["quartiles"]]) || isFALSE(dot_args[["quartiles"]])) { out$Q1 <- NULL out$Q3 <- NULL } if (!range) { out$Min <- NULL out$Max <- NULL } class(out) <- unique(c( "parameters_distribution", "see_parameters_distribution", class(out) )) attr(out, "data") <- x out } #' @export describe_distribution.character <- function( x, dispersion = TRUE, range = TRUE, verbose = TRUE, ... ) { # Missing n_missing <- sum(is.na(x)) x <- stats::na.omit(x) values <- unique(x) out <- data.frame( Mean = NA, SD = NA, IQR = NA, CI_low = NA, CI_high = NA, Min = values[1], Max = values[length(values)], Q1 = NA, Q3 = NA, Skewness = as.numeric(skewness(x, verbose = verbose)), Kurtosis = as.numeric(kurtosis(x, verbose = verbose)), n = length(x), n_Missing = n_missing, stringsAsFactors = FALSE ) if (!dispersion) { out$SD <- NULL } dot_args <- list(...) if (is.null(dot_args[["ci"]])) { out$CI_low <- NULL out$CI_high <- NULL } if (is.null(dot_args[["iqr"]]) || isFALSE(dot_args[["iqr"]])) { out$IQR <- NULL } if (is.null(dot_args[["quartiles"]]) || isFALSE(dot_args[["quartiles"]])) { out$Q1 <- NULL out$Q3 <- NULL } if (!range) { out$Min <- NULL out$Max <- NULL } class(out) <- unique(c( "parameters_distribution", "see_parameters_distribution", class(out) )) attr(out, "data") <- x out } #' @rdname describe_distribution #' @export describe_distribution.data.frame <- function( x, select = NULL, exclude = NULL, centrality = "mean", dispersion = TRUE, iqr = TRUE, range = TRUE, quartiles = FALSE, include_factors = FALSE, ci = NULL, iterations = 100, threshold = 0.1, ignore_case = FALSE, regex = FALSE, verbose = TRUE, by = NULL, ... ) { select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # check for reserved variable names .check_for_reserved_names(select) # tell user about bootstrapping and appropriate number of iterations .show_iterations_warning(verbose, iterations, ci) if (!is.null(by)) { if (!is.character(by)) { insight::format_error("`by` must be a character vector.") } x <- data_group(x, by) out <- describe_distribution( x, select = select, exclude = exclude, centrality = centrality, dispersion = dispersion, iqr = iqr, range = range, quartiles = quartiles, include_factors = include_factors, ci = ci, iterations = iterations, threshold = threshold, ignore_case = ignore_case, regex = regex, verbose = verbose ) out <- data_ungroup(out) return(out) } # The function currently doesn't support descriptive summaries for character # or factor types. out <- do.call( rbind, lapply(x[select], function(i) { if ( (include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i)) ) { describe_distribution( i, centrality = centrality, dispersion = dispersion, iqr = iqr, range = range, quartiles = quartiles, ci = ci, iterations = iterations, threshold = threshold, verbose = verbose, show_iterations_msg = FALSE ) } }) ) if (is.null(out)) { return(NULL) } out$Variable <- row.names(out) row.names(out) <- NULL out <- out[c("Variable", setdiff(colnames(out), "Variable"))] class(out) <- unique(c( "parameters_distribution", "see_parameters_distribution", class(out) )) attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500) attr(out, "ci") <- ci attr(out, "centrality") <- centrality attr(out, "threshold") <- threshold out } #' @export describe_distribution.grouped_df <- function( x, select = NULL, exclude = NULL, centrality = "mean", dispersion = TRUE, iqr = TRUE, range = TRUE, quartiles = FALSE, include_factors = FALSE, ci = NULL, iterations = 100, threshold = 0.1, ignore_case = FALSE, regex = FALSE, verbose = TRUE, by = NULL, ... ) { if (!is.null(by)) { if (!is.character(by)) { insight::format_error("`by` must be a character vector.") } existing_grps <- setdiff(colnames(attributes(x)$groups), ".rows") x <- data_group(x, c(existing_grps, by)) } group_vars <- setdiff(colnames(attributes(x)$groups), ".rows") group_data <- expand.grid(lapply(x[group_vars], function(i) unique(sort(i)))) groups <- split(x, x[group_vars]) groups <- Filter(function(x) nrow(x) > 0, groups) # check for reserved variable names .check_for_reserved_names(group_vars, type = "group_vars") select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # tell user about bootstrapping and appropriate number of iterations .show_iterations_warning(verbose, iterations, ci) out <- do.call( rbind, lapply(seq_along(groups), function(i) { d <- describe_distribution.data.frame( groups[[i]][select], centrality = centrality, dispersion = dispersion, iqr = iqr, range = range, quartiles = quartiles, include_factors = include_factors, ci = ci, iterations = iterations, threshold = threshold, verbose = verbose, show_iterations_msg = FALSE, ... ) for (grp in seq_along(group_vars)) { d[[group_vars[grp]]] <- group_data[i, grp] } d <- data_relocate(d, group_vars, before = 1) d }) ) class(out) <- unique(c( "parameters_distribution", "see_parameters_distribution", class(out) )) attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500) attr(out, "ci") <- ci attr(out, "centrality") <- centrality attr(out, "threshold") <- threshold out } # methods ------------------ #' @export print.parameters_distribution <- function(x, digits = 2, ...) { formatted_table <- format( x, digits = digits, format = "text", ci_width = NULL, ci_brackets = TRUE, ... ) cat(insight::export_table( formatted_table, format = "text", digits = digits, ... )) invisible(x) } #' @export print_md.parameters_distribution <- function( x, digits = 2, ci_brackets = c("(", ")"), ... ) { formatted_table <- format( x = x, digits = digits, format = "markdown", ci_width = NULL, ci_brackets = ci_brackets, ... ) insight::export_table( formatted_table, format = "markdown", align = "firstleft", ... ) } #' @export print_html.parameters_distribution <- function( x, digits = 2, ci_brackets = c("(", ")"), ... ) { formatted_table <- format( x = x, digits = digits, format = "html", ci_width = NULL, ci_brackets = ci_brackets, ... ) # determine backend backend <- .check_format_backend(...) # pass arguments to export_table fun_args <- list( formatted_table, format = backend, ... ) # no "align" for format "tt" - this currently gives an error. Not sure # if related to insight::export_table or tinytable if (identical(backend, "html")) { fun_args$align <- "firstleft" } do.call(insight::export_table, fun_args) } #' @export display.parameters_distribution <- function( object, format = "markdown", digits = 2, ... ) { format <- .display_default_format(format) fun_args <- list( x = object, digits = digits, ... ) # print table in HTML or markdown format if (format %in% c("html", "tt")) { fun_args$backend <- format do.call(print_html, fun_args) } else { do.call(print_md, fun_args) } } #' @export plot.parameters_distribution <- function(x, ...) { insight::check_if_installed("see") NextMethod() } # bootstrapping CIs ---------------------------------- .boot_distribution <- function(data, indices, centrality) { out <- datawizard::describe_distribution( data[indices], centrality = centrality, dispersion = FALSE, iqr = FALSE, range = FALSE, quartiles = FALSE, ci = NULL, verbose = FALSE ) out[[1]] } # check centrality options ---------------------------------------- .centrality_options <- function(centrality) { if (identical(centrality, "all")) { c("mean", "median", "MAP") } else { centrality } } # sanity check ---------------------------------------- .check_for_reserved_names <- function(x, type = "select") { reserved_names <- c( "Variable", "CI_low", "CI_high", "n_Missing", "Q1", "Q3", "Quartiles", "Min", "Max", "Range", "Trimmed_Mean", "Trimmed", "Mean", "SD", "IQR", "Skewness", "Kurtosis", "n", "Median", "MAD", "MAP", "IQR", "n_Missing" ) invalid_names <- intersect(reserved_names, x) if (length(invalid_names) > 0) { # adapt message to show user whether wrong variables appear in grouping or select msg <- switch( type, select = "with `describe_distribution()`: ", "as grouping variables in `describe_distribution()`: " ) insight::format_error(paste0( "Following variable names are reserved and cannot be used ", msg, text_concatenate(invalid_names, enclose = "`"), ". Please rename these variables in your data." )) } } .show_iterations_warning <- function(verbose, iterations = 100, ci = NULL) { if (verbose && !is.null(ci)) { msg <- paste( "Bootstrapping confidence intervals using", iterations, "iterations, please be patient..." ) if (iterations < 200) { msg <- c( msg, "For more stable intervals, increase the number of `iterations`, but note that this can also increase the computation time significantly." ) # nolint } insight::format_alert(msg) } } ================================================ FILE: R/descriptives.R ================================================ # distribution_mode ---------------------------------- #' Compute mode for a statistical distribution #' #' @param x An atomic vector, a list, or a data frame. #' #' @return #' #' The value that appears most frequently in the provided data. #' The returned data structure will be the same as the entered one. #' #' @seealso For continuous variables, the #' **Highest Maximum a Posteriori probability estimate (MAP)** may be #' a more useful way to estimate the most commonly-observed value #' than the mode. See [bayestestR::map_estimate()]. #' #' @examples #' #' distribution_mode(c(1, 2, 3, 3, 4, 5)) #' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)) #' #' @export distribution_mode <- function(x) { # TODO: Add support for weights, trim, binned (method) uniqv <- unique(x) tab <- tabulate(match(x, uniqv)) idx <- which.max(tab) uniqv[idx] } #' Compute the coefficient of variation #' #' Compute the coefficient of variation (CV, ratio of the standard deviation to #' the mean, \eqn{\sigma/\mu}) for a set of numeric values. #' #' @return The computed coefficient of variation for `x`. #' @export #' #' @examples #' coef_var(1:10) #' coef_var(c(1:10, 100), method = "median_mad") #' coef_var(c(1:10, 100), method = "qcd") #' coef_var(mu = 10, sigma = 20) #' coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30) coef_var <- function(x, ...) { UseMethod("coef_var") } #' @name distribution_cv #' @rdname coef_var #' @export distribution_coef_var <- coef_var #' @export coef_var.default <- function(x, verbose = TRUE, ...) { if (verbose) { insight::format_warning( paste0( "Can't compute the coefficient of variation objects of class `", class(x)[1], "`." ) ) } NULL } #' @param x A numeric vector of ratio scale (see details), or vector of values than can be coerced to one. #' @param mu A numeric vector of mean values to use to compute the coefficient #' of variation. If supplied, `x` is not used to compute the mean. #' @param sigma A numeric vector of standard deviation values to use to compute the coefficient #' of variation. If supplied, `x` is not used to compute the SD. #' @param method Method to use to compute the CV. Can be `"standard"` to compute #' by dividing the standard deviation by the mean, `"unbiased"` for the #' unbiased estimator for normally distributed data, or one of two robust #' alternatives: `"median_mad"` to divide the median by the [stats::mad()], #' or `"qcd"` (quartile coefficient of dispersion, interquartile range divided #' by the sum of the quartiles \[twice the midhinge\]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}. #' @param trim the fraction (0 to 0.5) of values to be trimmed from #' each end of `x` before the mean and standard deviation (or other measures) #' are computed. Values of `trim` outside the range of (0 to 0.5) are taken #' as the nearest endpoint. #' @param remove_na Logical. Should `NA` values be removed before computing (`TRUE`) #' or not (`FALSE`, default)? #' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not #' computed from `x`), what sample size to use to adjust the computed CV #' for small-sample bias? #' @param ... Further arguments passed to computation functions. #' #' @details #' CV is only applicable of values taken on a ratio scale: values that have a #' *fixed* meaningfully defined 0 (which is either the lowest or highest #' possible value), and that ratios between them are interpretable For example, #' how many sandwiches have I eaten this week? 0 means "none" and 20 sandwiches #' is 4 times more than 5 sandwiches. If I were to center the number of #' sandwiches, it will no longer be on a ratio scale (0 is no "none" it is the #' mean, and the ratio between 4 and -2 is not meaningful). Scaling a ratio #' scale still results in a ratio scale. So I can re define "how many half #' sandwiches did I eat this week ( = sandwiches * 0.5) and 0 would still mean #' "none", and 20 half-sandwiches is still 4 times more than 5 half-sandwiches. #' #' This means that CV is **NOT** invariant to shifting, but it is to scaling: #' ```{r} #' sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7) #' coef_var(sandwiches) #' #' coef_var(sandwiches / 2) # same #' #' coef_var(sandwiches + 4) # different! 0 is no longer meaningful! #' ```` #' #' @rdname coef_var #' #' @export coef_var.numeric <- function( x, mu = NULL, sigma = NULL, method = c("standard", "unbiased", "median_mad", "qcd"), trim = 0, remove_na = FALSE, n = NULL, ... ) { # TODO: Support weights if (!missing(x) && all(c(-1, 1) %in% sign(x))) { insight::format_error( "Coefficient of variation only applicable for ratio scale variables." ) } method <- match.arg( method, choices = c("standard", "unbiased", "median_mad", "qcd") ) if (is.null(mu) || is.null(sigma)) { if (isTRUE(remove_na)) { x <- .drop_na(x) } n <- length(x) x <- .trim_values(x, trim = trim, n = n) } if (is.null(mu)) { mu <- switch( method, standard = , unbiased = mean(x, ...), median_mad = stats::median(x, ...), qcd = unname(sum(stats::quantile(x, probs = c(0.25, 0.75), ...))) ) } if (is.null(sigma)) { sigma <- switch( method, standard = , unbiased = stats::sd(x, ...), median_mad = stats::mad(x, center = mu, ...), qcd = unname(diff(stats::quantile(x, probs = c(0.25, 0.75), ...))) ) } out <- sigma / mu if (method == "unbiased") { if (is.null(n)) { insight::format_error( "A value for `n` must be provided when `method = \"unbiased\"` and both `mu` and `sigma` are provided." ) } # from DescTools::CoefVar out <- out * (1 - 1 / (4 * (n - 1)) + 1 / n * out^2 + 1 / (2 * (n - 1)^2)) } out } # descriptives helpers .drop_na <- function(x) { x[!is.na(x)] } .trim_values <- function(x, trim = 0, n = NULL, weights = NULL) { # TODO: Support weights if (!is.numeric(trim) || length(trim) != 1L) { insight::format_error("`trim` must be a single numeric value.") } if (is.null(n)) { n <- length(x) } if (trim > 0 && n) { if (anyNA(x)) { return(NA_real_) } if (trim >= 0.5) { return(stats::median(x, na.rm = FALSE)) } lo <- floor(n * trim) + 1 hi <- n + 1 - lo x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi] } x } ================================================ FILE: R/extract_column_names.R ================================================ #' @title Find or get columns in a data frame based on search patterns #' @name extract_column_names #' #' @description `extract_column_names()` returns column names from a data set that #' match a certain search pattern, while `data_select()` returns the found data. #' #' @param data A data frame. #' @param select Variables that will be included when performing the required #' tasks. Can be either #' #' - a variable specified as a literal variable name (e.g., `column_name`), #' - a string with the variable name (e.g., `"column_name"`), a character #' vector of variable names (e.g., `c("col1", "col2", "col3")`), or a #' character vector of variable names including ranges specified via `:` #' (e.g., `c("col1:col3", "col5")`), #' - for some functions, like `data_select()` or `data_rename()`, `select` can #' be a named character vector. In this case, the names are used to rename #' the columns in the output data frame. See 'Details' in the related #' functions to see where this option applies. #' - a formula with variable names (e.g., `~column_1 + column_2`), #' - a vector of positive integers, giving the positions counting from the left #' (e.g. `1` or `c(1, 3, 5)`), #' - a vector of negative integers, giving the positions counting from the #' right (e.g., `-1` or `-1:-3`), #' - one of the following select-helpers: `starts_with()`, `ends_with()`, #' `contains()`, a range using `:`, or `regex()`. `starts_with()`, #' `ends_with()`, and `contains()` accept several patterns, e.g #' `starts_with("Sep", "Petal")`. `regex()` can be used to define regular #' expression patterns. #' - a function testing for logical conditions, e.g. `is.numeric()` (or #' `is.numeric`), or any user-defined function that selects the variables #' for which the function returns `TRUE` (like: `foo <- function(x) mean(x) > 3`), #' - ranges specified via literal variable names, select-helpers (except #' `regex()`) and (user-defined) functions can be negated, i.e. return #' non-matching elements, when prefixed with a `-`, e.g. `-ends_with()`, #' `-is.numeric` or `-(Sepal.Width:Petal.Length)`. **Note:** Negation means #' that matches are _excluded_, and thus, the `exclude` argument can be #' used alternatively. For instance, `select=-ends_with("Length")` (with #' `-`) is equivalent to `exclude=ends_with("Length")` (no `-`). In case #' negation should not work as expected, use the `exclude` argument instead. #' #' If `NULL`, selects all columns. Patterns that found no matches are silently #' ignored, e.g. `extract_column_names(iris, select = c("Species", "Test"))` #' will just return `"Species"`. #' @param exclude See `select`, however, column names matched by the pattern #' from `exclude` will be excluded instead of selected. If `NULL` (the default), #' excludes no columns. #' @param ignore_case Logical, if `TRUE` and when one of the select-helpers or #' a regular expression is used in `select`, ignores lower/upper case in the #' search pattern when matching against variable names. #' @param regex Logical, if `TRUE`, the search pattern from `select` will be #' treated as regular expression. When `regex = TRUE`, select *must* be a #' character string (or a variable containing a character string) and is not #' allowed to be one of the supported select-helpers or a character vector #' of length > 1. `regex = TRUE` is comparable to using one of the two #' select-helpers, `select = contains()` or `select = regex()`, however, #' since the select-helpers may not work when called from inside other #' functions (see 'Details'), this argument may be used as workaround. #' @param verbose Toggle warnings. #' @param ... Arguments passed down to other functions. Mostly not used yet. #' #' @inherit data_rename seealso #' #' @return #' #' `extract_column_names()` returns a character vector with column names that #' matched the pattern in `select` and `exclude`, or `NULL` if no matching #' column name was found. `data_select()` returns a data frame with matching #' columns. #' #' @details #' #' Specifically for `data_select()`, `select` can also be a named character #' vector. In this case, the names are used to rename the columns in the #' output data frame. See 'Examples'. #' #' Note that it is possible to either pass an entire select helper or only the #' pattern inside a select helper as a function argument: #' #' ```r #' foo <- function(data, pattern) { #' extract_column_names(data, select = starts_with(pattern)) #' } #' foo(iris, pattern = "Sep") #' #' foo2 <- function(data, pattern) { #' extract_column_names(data, select = pattern) #' } #' foo2(iris, pattern = starts_with("Sep")) #' ``` #' #' This means that it is also possible to use loop values as arguments or patterns: #' #' ```r #' for (i in c("Sepal", "Sp")) { #' head(iris) |> #' extract_column_names(select = starts_with(i)) |> #' print() #' } #' ``` #' #' However, this behavior is limited to a "single-level function". It will not #' work in nested functions, like below: #' #' ```r #' inner <- function(data, arg) { #' extract_column_names(data, select = arg) #' } #' outer <- function(data, arg) { #' inner(data, starts_with(arg)) #' } #' outer(iris, "Sep") #' ``` #' #' In this case, it is better to pass the whole select helper as the argument of #' `outer()`: #' #' ```r #' outer <- function(data, arg) { #' inner(data, arg) #' } #' outer(iris, starts_with("Sep")) #' ``` #' #' @examples #' # Find column names by pattern #' extract_column_names(iris, starts_with("Sepal")) #' extract_column_names(iris, ends_with("Width")) #' extract_column_names(iris, regex("\\.")) #' extract_column_names(iris, c("Petal.Width", "Sepal.Length")) #' #' # starts with "Sepal", but not allowed to end with "width" #' extract_column_names(iris, starts_with("Sepal"), exclude = contains("Width")) #' #' # find numeric with mean > 3.5 #' numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 #' extract_column_names(iris, numeric_mean_35) #' #' # find column names, using range #' extract_column_names(mtcars, c(cyl:hp, wt)) #' #' # find range of column names by range, using character vector #' extract_column_names(mtcars, c("cyl:hp", "wt")) #' #' # rename returned columns for "data_select()" #' head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) #' @export extract_column_names <- function( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { columns <- .select_nse( select, data, exclude, ignore_case = ignore_case, regex = regex, verbose = FALSE ) if (!length(columns) || is.null(columns)) { columns <- NULL if (isTRUE(verbose)) { insight::format_warning( "No column names that matched the required search pattern were found." ) } } columns } #' @rdname extract_column_names #' @export find_columns <- extract_column_names ================================================ FILE: R/format.R ================================================ # distribution --------------------------------- #' @export format.parameters_distribution <- function( x, digits = 2, format = NULL, ci_width = "auto", ci_brackets = TRUE, ... ) { # save information att <- attributes(x) if (all(c("Min", "Max") %in% names(x))) { x$Min <- insight::format_ci( x$Min, x$Max, ci = NULL, digits = digits, width = ci_width, brackets = ci_brackets ) x$Max <- NULL colnames(x)[which(colnames(x) == "Min")] <- "Range" } if (all(c("Q1", "Q3") %in% names(x))) { x$Q1 <- insight::format_ci( x$Q1, x$Q3, ci = NULL, digits = digits, width = ci_width, brackets = FALSE ) x$Q3 <- NULL colnames(x)[which(colnames(x) == "Q1")] <- "Quartiles" } # find CI columns. We might have multiple columns for different centralities ci_columns <- grepl("^(CI_low|CI_high)", colnames(x)) # make sure we have matches if (any(ci_columns)) { # iterate all centrality options centrality <- .centrality_options(att$centrality) for (ce in centrality) { # this is the original column name ci_columns <- c(paste0("CI_low_", ce), paste0("CI_high_", ce)) # we format CI column, merge it into one column x[[ci_columns[1]]] <- insight::format_ci( x[[ci_columns[1]]], x[[ci_columns[2]]], ci = NULL, digits = digits, width = ci_width, brackets = ci_brackets ) # ... and remove the no longer needed CI_high column x[[ci_columns[2]]] <- NULL ci_lvl <- attributes(x)$ci # find position of CI column ci_columm_pos <- which(colnames(x) == ci_columns[1]) # rename if (is.null(ci_lvl)) { colnames(x)[ci_columm_pos] <- sprintf( "CI (%s)", insight::format_capitalize(ce) ) } else { colnames(x)[ci_columm_pos] <- sprintf( "%i%% CI (%s)", round(100 * ci_lvl), insight::format_capitalize(ce) ) } # make sure we have the correct column name of the centrality centr_name <- switch( tolower(ce), mean = "Mean", median = "Median", map = "MAP" ) # reorder CI column, move it to related centrality index centr_pos <- which(colnames(x) == centr_name) if (length(centr_pos)) { x <- data_relocate(x, select = ci_columm_pos, after = centr_pos) } } } if ("Trimmed_Mean" %in% colnames(x)) { threshold <- attributes(x)$threshold if (is.null(threshold)) { trim_name <- "Trimmed" } else { trim_name <- sprintf("Trimmed (%g%%)", round(100 * threshold)) } colnames(x)[which(colnames(x) == "Trimmed_Mean")] <- trim_name } x } ================================================ FILE: R/labels_to_levels.R ================================================ #' @title Convert value labels into factor levels #' @name labels_to_levels #' #' @details #' `labels_to_levels()` allows to use value labels of factors as their levels. #' #' @param x A data frame or factor. Other variable types (e.g. numerics) are not #' allowed. #' @param ... Currently not used. #' @inheritParams extract_column_names #' @inheritParams categorize #' #' @return `x`, where for all factors former levels are replaced by their value #' labels. #' #' @examples #' data(efc) #' # create factor #' x <- as.factor(efc$c172code) #' # add value labels - these are not factor levels yet #' x <- assign_labels(x, values = c(`1` = "low", `2` = "mid", `3` = "high")) #' levels(x) #' data_tabulate(x) #' #' x <- labels_to_levels(x) #' levels(x) #' data_tabulate(x) #' @export labels_to_levels <- function(x, ...) { UseMethod("labels_to_levels") } #' @export labels_to_levels.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert("`labels_to_levels()` only works for factors.") } x } #' @rdname labels_to_levels #' @export labels_to_levels.factor <- function(x, verbose = TRUE, ...) { if (is.null(attr(x, "labels", exact = TRUE))) { insight::format_error( "Could not change factor levels. Variable had no value labels." ) } .value_labels_to_levels(x, verbose = verbose) } #' @rdname labels_to_levels #' @export labels_to_levels.data.frame <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, append = FALSE, regex = FALSE, verbose = TRUE, ... ) { # validation check, return as is for complete factor if (all(vapply(x, is.factor, TRUE))) { return(x) } # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # keep only factors select <- colnames(x[select])[vapply(x[select], is.factor, TRUE)] # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments arguments <- .process_append( x, select, append, append_suffix = "_l", preserve_value_labels = TRUE, keep_factors = TRUE, keep_character = FALSE ) # update processed arguments x <- arguments$x select <- arguments$select } x[select] <- lapply( x[select], labels_to_levels, verbose = verbose, ... ) x } ================================================ FILE: R/makepredictcall.R ================================================ #' Utility Function for Safe Prediction with `datawizard` transformers #' #' This function allows for the use of (some of) `datawizard`'s transformers #' inside a model formula. See examples below. #' \cr\cr #' Currently, [center()], [standardize()], [normalize()], & [rescale()] are #' supported. #' #' @inheritParams stats::makepredictcall #' #' @inherit stats::makepredictcall return #' @importFrom stats makepredictcall #' #' @seealso [stats::makepredictcall()] #' @family datawizard-transformers #' #' @examples #' #' data("mtcars") #' train <- mtcars[1:30, ] #' test <- mtcars[31:32, ] #' #' m1 <- lm(mpg ~ center(hp), data = train) #' predict(m1, newdata = test) # Data is "centered" before the prediction is made, #' # according to the center of the old data #' #' m2 <- lm(mpg ~ standardize(hp), data = train) #' m3 <- lm(mpg ~ scale(hp), data = train) # same as above #' predict(m2, newdata = test) # Data is "standardized" before the prediction is made. #' predict(m3, newdata = test) # Data is "standardized" before the prediction is made. #' #' #' m4 <- lm(mpg ~ normalize(hp), data = mtcars) #' m5 <- lm(mpg ~ rescale(hp, to = c(-3, 3)), data = mtcars) #' #' (newdata <- data.frame(hp = c(range(mtcars$hp), 400))) # 400 is outside original range! #' #' model.frame(delete.response(terms(m4)), data = newdata) #' model.frame(delete.response(terms(m5)), data = newdata) #' #' @export makepredictcall.dw_transformer <- function(var, call) { if (is.matrix(var) || is.array(var)) { insight::format_error( "datawizard scalers in model formulas are not supported for matrices." ) } switch( as.character(call)[1L], centre = , center = { call$center <- attr(var, "center") }, standardise = , standardize = { call$center <- attr(var, "center") call$scale <- attr(var, "scale") }, normalize = , normalise = { call$min_value <- attr(var, "min_value") call$range_difference <- attr(var, "range_difference") call$vector_length <- attr(var, "vector_length") call$include_bounds <- attr(var, "include_bounds") call$flag_bounds <- attr(var, "flag_bounds") }, rescale = { call$min_value <- attr(var, "min_value") call$max_value <- attr(var, "max_value") call$new_min <- attr(var, "new_min") call$new_max <- attr(var, "new_max") }, # ELSE: { return(call) } ) call$verbose <- FALSE call } ================================================ FILE: R/mean_sd.R ================================================ #' Summary Helpers #' #' @param x A numeric vector (or one that can be coerced to one via #' `as.numeric()`) to be summarized. #' @param named Should the vector be named? #' (E.g., `c("-SD" = -1, Mean = 1, "+SD" = 2)`.) #' @param times How many SDs above and below the Mean (or MADs around the Median) #' @param ... Not used. #' @inheritParams coef_var #' @inheritParams stats::mad #' #' @return A (possibly named) numeric vector of length `2*times + 1` of SDs #' below the mean, the mean, and SDs above the mean (or median and MAD). #' #' @examples #' mean_sd(mtcars$mpg) #' #' mean_sd(mtcars$mpg, times = 2L) #' #' median_mad(mtcars$mpg) #' #' @export mean_sd <- function(x, times = 1L, remove_na = TRUE, named = TRUE, ...) { .centrality_dispersion( x, type = "mean", times = times, remove_na = remove_na, named = named ) } #' @export #' @rdname mean_sd median_mad <- function( x, times = 1L, remove_na = TRUE, constant = 1.4826, named = TRUE, ... ) { .centrality_dispersion( x, type = "median", times = times, remove_na = remove_na, constant = constant, named = named ) } #' @keywords Internal .centrality_dispersion <- function( x, type = "mean", remove_na = TRUE, times = 1L, constant = 1.4826, named = TRUE, ... ) { x <- as.numeric(x) times <- as.integer(times) type <- match.arg(type, choices = c("mean", "median")) # centrality M <- switch( type, median = stats::median(x, na.rm = remove_na), mean(x, na.rm = remove_na) ) S <- switch( type, median = stats::mad(x, na.rm = remove_na, constant = constant), stats::sd(x, na.rm = remove_na) ) v <- M + c(-rev(seq_len(times)), 0, seq_len(times)) * S if (isTRUE(named)) { string_cs <- switch(type, median = c("Median", "MAD"), c("Mean", "SD")) if (times == 1L) { times <- "" } else { times <- paste0(seq_len(times), " ") } names(v) <- c( paste0("-", rev(times), string_cs[2]), string_cs[1], paste0("+", times, string_cs[2]) ) } v } ================================================ FILE: R/means_by_group.R ================================================ #' @title Summary of mean values by group #' @name means_by_group #' #' @description Computes summary table of means by groups. #' #' @param x A vector or a data frame. #' @param by If `x` is a numeric vector, `by` should be a factor that #' indicates the group-classifying categories. If `x` is a data frame, `by` #' should be a character string, naming the variable in `x` that is used for #' grouping. Numeric vectors are coerced to factors. Not that `by` should #' only refer to a single variable. #' @param ci Level of confidence interval for mean estimates. Default is `0.95`. #' Use `ci = NA` to suppress confidence intervals. #' @param weights If `x` is a numeric vector, `weights` should be a vector of #' weights that will be applied to weight all observations. If `x` is a data #' frame, `weights` can also be a character string indicating the name of the #' variable in `x` that should be used for weighting. Default is `NULL`, so no #' weights are used. #' @param digits Optional scalar, indicating the amount of digits after decimal #' point when rounding estimates and values. #' @param ... Currently not used #' @inheritParams find_columns #' #' @return A data frame with information on mean and further summary statistics #' for each sub-group. #' #' @details This function is comparable to `aggregate(x, by, mean)`, but provides #' some further information, including summary statistics from a One-Way-ANOVA #' using `x` as dependent and `by` as independent variable. [`emmeans::contrast()`] #' is used to get p-values for each sub-group. P-values indicate whether each #' group-mean is significantly different from the total mean. #' #' @examples #' data(efc) #' means_by_group(efc, "c12hour", "e42dep") #' #' data(iris) #' means_by_group(iris, "Sepal.Width", "Species") #' #' # weighting #' efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) #' means_by_group(efc, "c12hour", "e42dep", weights = "weight") #' @export means_by_group <- function(x, ...) { UseMethod("means_by_group") } #' @export means_by_group.default <- function(x, ...) { insight::format_error( "`means_by_group()` does not work for objects of class `", class(x)[1], "`." ) } #' @rdname means_by_group #' @export means_by_group.numeric <- function( x, by = NULL, ci = 0.95, weights = NULL, digits = NULL, ... ) { # validation check for arguments # "by" must be provided if (is.null(by)) { insight::format_error("Argument `by` is missing.") } # by must be of same length as x if (length(by) != length(x)) { insight::format_error("Argument `by` must be of same length as `x`.") } # if weights are provided, must be of same length as x if (!is.null(weights) && length(weights) != length(x)) { insight::format_error("Argument `weights` must be of same length as `x`.") } # if weights are NULL, set weights to 1 if (is.null(weights)) { weights <- rep(1, length(x)) } # retrieve labels var_mean_label <- attr(x, "label", exact = TRUE) var_grp_label <- attr(by, "label", exact = TRUE) # if no labels present, use variable names directly if (is.null(var_mean_label)) { var_mean_label <- deparse(substitute(x)) } if (is.null(var_grp_label)) { var_grp_label <- deparse(substitute(by)) } # coerce group to factor if numeric, or convert labels to levels, if factor if (is.factor(by)) { by <- tryCatch(labels_to_levels(by, verbose = FALSE), error = function(e) { by }) } else { by <- to_factor(by) } my_data <- stats::na.omit(data.frame( x = x, group = by, weights = weights, stringsAsFactors = FALSE )) # get grouped means table out <- .means_by_group(my_data, ci = ci) # attributes attr(out, "var_mean_label") <- var_mean_label attr(out, "var_grp_label") <- var_grp_label attr(out, "digits") <- digits class(out) <- c("dw_groupmeans", "data.frame") out } #' @rdname means_by_group #' @export means_by_group.data.frame <- function( x, select = NULL, by = NULL, ci = 0.95, weights = NULL, digits = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) if (is.null(weights)) { w <- NULL } else if (is.character(weights)) { w <- x[[weights]] } else { w <- weights } out <- lapply(select, function(i) { # if no labels present, use variable names directy if (is.null(attr(x[[i]], "label", exact = TRUE))) { attr(x[[i]], "label") <- i } if (is.null(attr(x[[by]], "label", exact = TRUE))) { attr(x[[by]], "label") <- by } # compute means table means_by_group( x[[i]], by = x[[by]], ci = ci, weights = w, digits = digits, ... ) }) class(out) <- c("dw_groupmeans_list", "list") out } #' @keywords internal .means_by_group <- function(data, ci = 0.95) { # compute anova statistics for mean table if (is.null(data$weights) || all(data$weights == 1)) { fit <- stats::lm(x ~ group, data = data) } else { fit <- stats::lm(x ~ group, weights = data$weights, data = data) } # summary table data groups <- split(data$x, data$group) group_weights <- split(data$weights, data$group) out <- do.call( rbind, Map( function(x, w) { data.frame( Mean = weighted_mean(x, weights = w), SD = weighted_sd(x, weights = w), N = round(sum(w)), stringsAsFactors = FALSE ) }, groups, group_weights ) ) # add group names out$Category <- levels(data$group) out$p <- out$CI_high <- out$CI_low <- NA # p-values of contrast-means if (insight::check_if_installed("emmeans", quietly = TRUE)) { # create summary table of contrasts, for p-values and confidence intervals predicted <- emmeans::emmeans(fit, specs = "group", level = ci) emm_contrasts <- emmeans::contrast(predicted, method = "eff") # add p-values and confidence intervals to "out" if (!is.null(ci) && !is.na(ci)) { summary_table <- as.data.frame(predicted) out$CI_low <- summary_table$lower.CL out$CI_high <- summary_table$upper.CL } summary_table <- as.data.frame(emm_contrasts) out$p <- summary_table$p.value } # reorder columns out <- out[c("Category", "Mean", "N", "SD", "CI_low", "CI_high", "p")] # finally, add total-row out <- rbind( out, data.frame( Category = "Total", Mean = weighted_mean(data$x, weights = data$weights), N = nrow(data), SD = weighted_sd(data$x, weights = data$weights), CI_low = NA, CI_high = NA, p = NA, stringsAsFactors = FALSE ) ) # get anova statistics for mean table sum.fit <- summary(fit) # r-squared values r2 <- sum.fit$r.squared r2.adj <- sum.fit$adj.r.squared # F-statistics fstat <- sum.fit$fstatistic pval <- stats::pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE) # copy as attributes attr(out, "r2") <- r2 attr(out, "ci") <- ci attr(out, "adj.r2") <- r2.adj attr(out, "fstat") <- fstat[1] attr(out, "p.value") <- pval out } # methods ----------------- #' @export format.dw_groupmeans <- function(x, digits = NULL, ...) { if (is.null(digits)) { digits <- attr(x, "digits", exact = TRUE) } if (is.null(digits)) { digits <- 2 } x$N <- insight::format_value(x$N, digits = 0) insight::format_table(remove_empty_columns(x), digits = digits, ...) } #' @export print.dw_groupmeans <- function(x, digits = NULL, ...) { out <- format(x, digits = digits, ...) # caption l1 <- attributes(x)$var_mean_label l2 <- attributes(x)$var_grp_label if (!is.null(l1) && !is.null(l2)) { caption <- c(paste0("# Mean of ", l1, " by ", l2), "blue") } else { caption <- NULL } # footer footer <- paste0( "\nAnova: R2=", insight::format_value(attributes(x)$r2, digits = 3), "; adj.R2=", insight::format_value(attributes(x)$adj.r2, digits = 3), "; F=", insight::format_value(attributes(x)$fstat, digits = 3), "; ", insight::format_p(attributes(x)$p.value, whitespace = FALSE), "\n" ) cat(insight::export_table(out, caption = caption, footer = footer, ...)) } #' @export print.dw_groupmeans_list <- function(x, digits = NULL, ...) { for (i in seq_along(x)) { if (i > 1) { cat("\n") } print(x[[i]], digits = digits, ...) } } ================================================ FILE: R/normalize.R ================================================ #' Normalize numeric variable to 0-1 range #' #' Performs a normalization of data, i.e., it scales variables in the range #' 0 - 1. This is a special case of [rescale()]. `unnormalize()` is the #' counterpart, but only works for variables that have been normalized with #' `normalize()`. #' #' @param x A numeric vector, (grouped) data frame, or matrix. See 'Details'. #' @param include_bounds Numeric or logical. Using this can be useful in case of #' beta-regression, where the response variable is not allowed to include #' zeros and ones. If `TRUE`, the input is normalized to a range that includes #' zero and one. If `FALSE`, the return value is compressed, using #' Smithson and Verkuilen's (2006) formula `(x * (n - 1) + 0.5) / n`, to avoid #' zeros and ones in the normalized variables. Else, if numeric (e.g., `0.001`), #' `include_bounds` defines the "distance" to the lower and upper bound, i.e. #' the normalized vectors are rescaled to a range from `0 + include_bounds` to #' `1 - include_bounds`. #' @param ... Arguments passed to or from other methods. #' @inheritParams standardize.data.frame #' @inheritParams extract_column_names #' #' @inheritSection center Selection of variables - the `select` argument #' #' @details #' #' - If `x` is a matrix, normalization is performed across all values (not #' column- or row-wise). For column-wise normalization, convert the matrix to a #' data.frame. #' - If `x` is a grouped data frame (`grouped_df`), normalization is performed #' separately for each group. #' #' @seealso See [makepredictcall.dw_transformer()] for use in model formulas. #' #' @examples #' #' normalize(c(0, 1, 5, -5, -2)) #' normalize(c(0, 1, 5, -5, -2), include_bounds = FALSE) #' # use a value defining the bounds #' normalize(c(0, 1, 5, -5, -2), include_bounds = 0.001) #' #' head(normalize(trees)) #' #' @references #' #' Smithson M, Verkuilen J (2006). A Better Lemon Squeezer? Maximum-Likelihood #' Regression with Beta-Distributed Dependent Variables. Psychological Methods, #' 11(1), 54–71. #' #' @family transform utilities #' #' @return A normalized object. #' #' @export normalize <- function(x, ...) { UseMethod("normalize") } #' @rdname normalize #' @export normalize.numeric <- function(x, include_bounds = TRUE, verbose = TRUE, ...) { # Warning if all NaNs or infinite if (all(is.infinite(x) | is.na(x))) { return(x) } # safe name, for later use if (is.null(names(x))) { name <- insight::safe_deparse(substitute(x)) } else { name <- names(x) } # Get infinite and replace by NA (so that the normalization doesn't fail) infinite_idx <- is.infinite(x) infinite_vals <- x[infinite_idx] x[infinite_idx] <- NA # called from "makepredictcal()"? Then we have additional arguments dot_args <- list(...) flag_predict <- FALSE required_dot_args <- c( "range_difference", "min_value", "vector_length", "flag_bounds" ) if (all(required_dot_args %in% names(dot_args))) { # we gather informatiom about the original data, which is needed # for "predict()" to work properly when "normalize()" is called # in formulas on-the-fly, e.g. "lm(mpg ~ normalize(hp), data = mtcars)" range_difference <- dot_args$range_difference min_value <- dot_args$min_value vector_length <- dot_args$vector_length flag_bounds <- dot_args$flag_bounds flag_predict <- TRUE } else { range_difference <- diff(range(x, na.rm = TRUE)) min_value <- min(x, na.rm = TRUE) vector_length <- length(x) flag_bounds <- NULL } # Warning if only one value if (!flag_predict && insight::has_single_value(x)) { if (verbose) { insight::format_warning( paste0( "Variable `", name, "` contains only one unique value and will not be normalized." ) ) } return(x) } # Warning if logical vector if (insight::n_unique(x) == 2 && verbose) { insight::format_warning( paste0( "Variable `", name, "` contains only two unique values. Consider converting it to a factor." ) ) } # rescale out <- as.vector((x - min_value) / range_difference) # if we don't have information on whether bounds are included or not, # get this information here. if (is.null(flag_bounds)) { flag_bounds <- (any(out == 0) || any(out == 1)) } if (!isTRUE(include_bounds) && flag_bounds) { if (isFALSE(include_bounds)) { out <- (out * (vector_length - 1) + 0.5) / vector_length } else if ( is.numeric(include_bounds) && include_bounds > 0 && include_bounds < 1 ) { out <- rescale(out, to = c(0 + include_bounds, 1 - include_bounds)) } else if (verbose) { insight::format_warning( "`include_bounds` must be either logical or numeric (between 0 and 1).", "Bounds (zeros and ones) are included in the returned value." ) } } # Re-insert infinite values out[infinite_idx] <- infinite_vals attr(out, "include_bounds") <- include_bounds attr(out, "flag_bounds") <- isTRUE(flag_bounds) attr(out, "min_value") <- min_value attr(out, "vector_length") <- vector_length attr(out, "range_difference") <- range_difference # don't add attribute when we call data frame methods if (!isFALSE(dot_args$add_transform_class)) { class(out) <- c("dw_transformer", class(out)) } out } #' @export normalize.factor <- function(x, ...) { x } #' @export normalize.grouped_df <- function( x, select = NULL, exclude = NULL, include_bounds = TRUE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, remove_group_var = TRUE, verbose = verbose ) info <- attributes(x) grps <- attr(x, "groups", exact = TRUE)[[".rows"]] # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_n" ) # update processed arguments x <- my_args$x select <- my_args$select } x <- as.data.frame(x) # create column(s) to store dw_transformer attributes for (i in select) { info$groups[[paste0("attr_", i)]] <- rep(NA, length(grps)) } for (rows in seq_along(grps)) { tmp <- normalize( x[grps[[rows]], , drop = FALSE], select = select, exclude = exclude, include_bounds = include_bounds, verbose = verbose, append = FALSE, # need to set to FALSE here, else variable will be doubled add_transform_class = FALSE, ... ) # store dw_transformer_attributes for (i in select) { info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[ i ]]))) } x[grps[[rows]], ] <- tmp } # last column of "groups" attributes must be called ".rows" info$groups <- data_relocate(info$groups, ".rows", after = -1) # set back class, so data frame still works with dplyr attributes(x) <- utils::modifyList(info, attributes(x)) class(x) <- c("grouped_df", class(x)) x } #' @rdname normalize #' @export normalize.data.frame <- function( x, select = NULL, exclude = NULL, include_bounds = TRUE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_n" ) # update processed arguments x <- my_args$x select <- my_args$select } x[select] <- lapply( x[select], normalize, include_bounds = include_bounds, verbose = verbose, add_transform_class = FALSE ) x } #' @export normalize.matrix <- function(x, ...) { matrix(normalize(as.numeric(x), ...), nrow = nrow(x)) } ================================================ FILE: R/ranktransform.R ================================================ #' (Signed) rank transformation #' #' Transform numeric values with the integers of their rank (i.e., 1st smallest, #' 2nd smallest, 3rd smallest, etc.). Setting the `sign` argument to `TRUE` will #' give you signed ranks, where the ranking is done according to absolute size #' but where the sign is preserved (i.e., 2, 1, -3, 4). #' #' @param x Object. #' @param sign Logical, if `TRUE`, return signed ranks. #' @param method Treatment of ties. Can be one of `"average"` (default), #' `"first"`, `"last"`, `"random"`, `"max"` or `"min"`. See [rank()] for #' details. #' @param zeros How to handle zeros. If `"na"` (default), they are marked as #' `NA`. If `"signrank"`, they are kept during the ranking and marked as zeros. #' This is only used when `sign = TRUE`. #' @param ... Arguments passed to or from other methods. #' @inheritParams extract_column_names #' @inheritParams standardize.data.frame #' #' @inheritSection center Selection of variables - the `select` argument #' #' @examples #' ranktransform(c(0, 1, 5, -5, -2)) #' #' # By default, zeros are converted to NA #' suppressWarnings( #' ranktransform(c(0, 1, 5, -5, -2), sign = TRUE) #' ) #' ranktransform(c(0, 1, 5, -5, -2), sign = TRUE, zeros = "signrank") #' #' head(ranktransform(trees)) #' @return A rank-transformed object. #' #' @family transform utilities #' #' @export ranktransform <- function(x, ...) { UseMethod("ranktransform") } #' @rdname ranktransform #' @export ranktransform.numeric <- function( x, sign = FALSE, method = "average", zeros = "na", verbose = TRUE, ... ) { # no change if all values are `NA`s if (all(is.na(x))) { return(x) } zeros <- match.arg(zeros, c("na", "signrank")) method <- match.arg( method, c("average", "first", "last", "random", "max", "min") ) # Warning if only one value and return early if (insight::has_single_value(x)) { if (is.null(names(x))) { name <- deparse(substitute(x)) } else { name <- names(x) } if (verbose) { insight::format_warning( paste0( "Variable `", name, "` contains only one unique value and will not be normalized." ) ) } return(x) } # Warning if only two values present but don't return early if (length(unique(x)) == 2L) { if (is.null(names(x))) { name <- deparse(substitute(x)) } else { name <- names(x) } if (verbose) { # nolint insight::format_warning( paste0( "Variable `", name, "` contains only two different values. Consider converting it to a factor." ) ) } } if (sign) { if (zeros == "na") { out <- rep(NA, length(x)) ZEROES <- x == 0 if (any(ZEROES) && verbose) { insight::format_warning( "Zeros detected. These cannot be sign-rank transformed." ) # nolint } out[!ZEROES] <- sign(x[!ZEROES]) * rank(abs(x[!ZEROES]), ties.method = method, na.last = "keep") } else if (zeros == "signrank") { out <- sign(x) * rank(abs(x), ties.method = method, na.last = "keep") } } else { out <- rank(x, ties.method = method, na.last = "keep") } out } #' @export ranktransform.factor <- function(x, ...) { x } #' @export ranktransform.grouped_df <- function( x, select = NULL, exclude = NULL, sign = FALSE, method = "average", ignore_case = FALSE, regex = FALSE, zeros = "na", verbose = TRUE, ... ) { info <- attributes(x) grps <- attr(x, "groups", exact = TRUE)[[".rows"]] # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) x <- as.data.frame(x) for (rows in grps) { x[rows, ] <- ranktransform( x[rows, , drop = FALSE], select = select, exclude = exclude, sign = sign, method = method, ... ) } # set back class, so data frame still works with dplyr attributes(x) <- info x } #' @rdname ranktransform #' @export ranktransform.data.frame <- function( x, select = NULL, exclude = NULL, sign = FALSE, method = "average", ignore_case = FALSE, regex = FALSE, zeros = "na", verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) x[select] <- lapply(x[select], ranktransform, sign = sign, method = method) x } ================================================ FILE: R/recode_into.R ================================================ #' @title Recode values from one or more variables into a new variable #' @name recode_into #' #' @description #' This functions recodes values from one or more variables into a new variable. #' It is a convenient function to avoid nested [`ifelse()`] statements, which #' is similar to `dplyr::case_when()`. #' #' @param ... A sequence of two-sided formulas, where the left hand side (LHS) #' is a logical matching condition that determines which values match this case. #' The LHS of this formula is also called "recode pattern" (e.g., in messages). #' The right hand side (RHS) indicates the replacement value. #' @param data Optional, name of a data frame. This can be used to avoid writing #' the data name multiple times in `...`. See 'Examples'. #' @param default Indicates the default value that is chosen when no match in #' the formulas in `...` is found. If not provided, `NA` is used as default #' value. #' @param overwrite Logical, if `TRUE` (default) and more than one recode pattern #' apply to the same case, already recoded values will be overwritten by subsequent #' recode patterns. If `FALSE`, former recoded cases will not be altered by later #' recode patterns that would apply to those cases again. A warning message is #' printed to alert such situations and to avoid unintentional recodings. #' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing #' values in the original variable will be set back to `NA` in the recoded #' variable (unless overwritten by other recode patterns). If `FALSE`, missing #' values in the original variable will be recoded to `default`. Setting #' `preserve_na = TRUE` prevents unintentional overwriting of missing values #' with `default`, which means that you won't find valid values where the #' original data only had missing values. See 'Examples'. #' @param verbose Toggle warnings. #' #' @return A vector with recoded values. #' #' @examples #' x <- 1:30 #' recode_into( #' x > 15 ~ "a", #' x > 10 & x <= 15 ~ "b", #' default = "c" #' ) #' #' x <- 1:10 #' # default behaviour: second recode pattern "x > 5" overwrites #' # some of the formerly recoded cases from pattern "x >= 3 & x <= 7" #' recode_into( #' x >= 3 & x <= 7 ~ 1, #' x > 5 ~ 2, #' default = 0, #' verbose = FALSE #' ) #' #' # setting "overwrite = FALSE" will not alter formerly recoded cases #' recode_into( #' x >= 3 & x <= 7 ~ 1, #' x > 5 ~ 2, #' default = 0, #' overwrite = FALSE, #' verbose = FALSE #' ) #' #' set.seed(123) #' d <- data.frame( #' x = sample(1:5, 30, TRUE), #' y = sample(letters[1:5], 30, TRUE), #' stringsAsFactors = FALSE #' ) #' #' # from different variables into new vector #' recode_into( #' d$x %in% 1:3 & d$y %in% c("a", "b") ~ 1, #' d$x > 3 ~ 2, #' default = 0 #' ) #' #' # no need to write name of data frame each time #' recode_into( #' x %in% 1:3 & y %in% c("a", "b") ~ 1, #' x > 3 ~ 2, #' data = d, #' default = 0 #' ) #' #' # handling of missing values #' d <- data.frame( #' x = c(1, NA, 2, NA, 3, 4), #' y = c(1, 11, 3, NA, 5, 6) #' ) #' # first NA in x is overwritten by valid value from y #' # we have no known value for second NA in x and y, #' # thus we get one NA in the result #' recode_into( #' x <= 3 ~ 1, #' y > 5 ~ 2, #' data = d, #' default = 0, #' preserve_na = TRUE #' ) #' # first NA in x is overwritten by valid value from y #' # default value is used for second NA #' recode_into( #' x <= 3 ~ 1, #' y > 5 ~ 2, #' data = d, #' default = 0, #' preserve_na = FALSE #' ) #' @export recode_into <- function( ..., data = NULL, default = NA, overwrite = TRUE, preserve_na = FALSE, verbose = TRUE ) { dots <- list(...) # get length of vector, so we know the length of the output vector len <- if (is.null(data)) { length(.dynEval(dots[[1]][[2]], ifnotfound = NULL)) } else { length(with(data, eval(dots[[1]][[2]]))) } # how many expressions (recode-formulas) do we have? n_params <- length(dots) # last expression should always be the default value if (is.null(default)) { default <- NA if (verbose) { insight::format_warning( "Default value can't be `NULL`, setting to `NA` now." ) } } # create default output vector out <- rep(default, times = len) all_recodes <- NULL all_same_length <- NULL new_values <- NULL # check recode values for (i in seq_len(n_params)) { # get type of all recode values if (is.null(data)) { value_type <- .dynEval(dots[[i]][[3]], ifnotfound = NULL) value_length <- .dynEval(dots[[i]][[2]], ifnotfound = NULL) } else { value_type <- with(data, eval(dots[[i]][[3]])) value_length <- with(data, eval(dots[[i]][[2]])) } # if we have "NA", we don't want to check the type. Else, you cannot use # "NA" for numeric recodes, but rather need to use "NA_real_", which is not # user-friendly if (is.na(value_type)) { type <- NULL } else { type <- typeof(value_type) } len_matches <- length(value_length) # save type and length of recode values all_recodes <- c(all_recodes, type) all_same_length <- c(all_same_length, len_matches) new_values <- c(new_values, value_type) } # if we have mixed types, warn user if (!is.null(all_recodes) && !all(all_recodes == all_recodes[1])) { wrong_type <- which(all_recodes != all_recodes[1]) insight::format_error( paste( "Recoding not carried out. Not all recode values are of the same type.", sprintf( "For instance, the new value of the first pattern, `%s`, is of type `%s`. The new value of the %s recode pattern, `%s`, is of type `%s`.", # nolint insight::color_text(new_values[1], "cyan"), insight::color_text(all_recodes[1], "cyan"), .number_to_text(wrong_type[1]), insight::color_text(new_values[wrong_type[1]], "cyan"), insight::color_text(all_recodes[wrong_type[1]], "cyan") ) ) ) } # all inputs of correct length? if ( !is.null(all_same_length) && !all(all_same_length == all_same_length[1]) ) { wrong_length <- which(all_same_length != all_same_length[1]) insight::format_error( "The matching conditions return vectors of different length.", paste( "Please check if all variables in your recode patterns are of the same length.", sprintf( "For instance, the first and the %s recode pattern return vectors of different length.", .number_to_text(wrong_length[1]) ) ) ) } # indicator to show message when replacing NA by default # needed to show message only once overwrite_NA_msg <- TRUE # iterate all expressions for (i in seq_len(n_params)) { # grep index of observations with replacements and replacement value if (is.null(data)) { index <- .dynEval(dots[[i]][[2]], ifnotfound = NULL) value <- .dynEval(dots[[i]][[3]], ifnotfound = NULL) } else { index <- with(data, eval(dots[[i]][[2]])) value <- with(data, eval(dots[[i]][[3]])) } # remember missing values, so we can add back later missing_index <- is.na(index) # make sure index has no missing values. when we have missing values in # original expression, these are considered as "no match" and set to FALSE # we handle NA value later and thus want to remove them from "index" now index[is.na(index)] <- FALSE # overwriting values? do more recode-patterns match the same case? if (is.na(default)) { already_exists <- !is.na(out[index]) } else { already_exists <- out[index] != default } # save indices of overwritten cases overwritten_cases <- which(index)[already_exists] # tell user... if (any(already_exists, na.rm = TRUE) && verbose) { if (overwrite) { msg <- paste( "Several recode patterns apply to the same cases.", "Some of the already recoded cases will be overwritten with new values again", sprintf( "(e.g. pattern %i overwrites the former recode of case %i).", i, overwritten_cases[1] ) ) } else { msg <- paste( "Several recode patterns apply to the same cases.", "Some of the already recoded cases will not be altered by later recode patterns.", sprintf( "(e.g. pattern %i also matches the former recode of case %i).", i, overwritten_cases[1] ) ) } insight::format_warning(msg, "Please check if this is intentional!") } # if user doesn't want to overwrite, remove already recoded indices if (!overwrite) { index[overwritten_cases] <- FALSE } # write new values into output vector out[index] <- value # set back missing values if (any(missing_index) && !is.na(default)) { if (preserve_na) { # but only where we still have default values # we don't want to overwrite already recoded values with NA out[missing_index & out == default] <- NA } else if (overwrite_NA_msg && verbose) { # don't show msg again overwrite_NA_msg <- FALSE insight::format_alert( "Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`." # nolint ) } } } out } .number_to_text <- function(x) { if (is.null(x) || is.na(x)) { return("") } if (x == 1) { "first" } else if (x == 2) { "second" } else if (x == 3) { "third" } else if (x == 4) { "fourth" } else if (x == 5) { "fifth" } else if (x == 21) { "twenty-first" } else if (x == 22) { "twenty-second" } else if (x == 23) { "twenty-third" } else { paste0(x, "th") } } ================================================ FILE: R/recode_values.R ================================================ #' @title Recode old values of variables into new values #' @name recode_values #' #' @description #' This functions recodes old values into new values and can be used to to #' recode numeric or character vectors, or factors. #' #' @param x A data frame, numeric or character vector, or factor. #' @param recode A list of named vectors, which indicate the recode pairs. #' The _names_ of the list-elements (i.e. the left-hand side) represent the #' _new_ values, while the values of the list-elements indicate the original #' (old) values that should be replaced. When recoding numeric vectors, #' element names have to be surrounded in backticks. For example, #' ``recode=list(`0`=1)`` would recode all `1` into `0` in a numeric #' vector. See also 'Examples' and 'Details'. #' @param default Defines the default value for all values that have no match in #' the recode-pairs. If `NULL`, original values will be preserved when there #' is no match. Note that, if `preserve_na=FALSE`, missing values (`NA`) are #' also captured by the `default` argument, and thus will also be recoded into #' the specified value. See 'Examples' and 'Details'. #' @param preserve_na Logical, if `TRUE`, `NA` (missing values) are preserved. #' This overrides any other arguments, including `default`. Hence, if #' `preserve_na=TRUE`, `default` will no longer convert `NA` into the specified #' default value. #' @param ... not used. #' @inheritParams extract_column_names #' @inheritParams categorize #' #' @return `x`, where old values are replaced by new values. #' #' @inheritSection center Selection of variables - the `select` argument #' #' @inherit data_rename seealso #' #' @note You can use `options(data_recode_pattern = "old=new")` to switch the #' behaviour of the `recode`-argument, i.e. recode-pairs are now following the #' pattern `old values = new values`, e.g. if `getOption("data_recode_pattern")` #' is set to `"old=new"`, then ``recode(`1`=0)`` would recode all 1 into 0. #' The default for ``recode(`1`=0)`` is to recode all 0 into 1. #' #' @details #' This section describes the pattern of the `recode` arguments, which also #' provides some shortcuts, in particular when recoding numeric values. #' #' - Single values #' #' Single values either need to be wrapped in backticks (in case of numeric #' values) or "as is" (for character or factor levels). Example: #' ``recode=list(`0`=1,`1`=2)`` would recode 1 into 0, and 2 into 1. #' For factors or character vectors, an example is: #' `recode=list(x="a",y="b")` (recode "a" into "x" and "b" into "y"). #' #' - Multiple values #' #' Multiple values that should be recoded into a new value can be separated #' with comma. Example: ``recode=list(`1`=c(1,4),`2`=c(2,3))`` would recode the #' values 1 and 4 into 1, and 2 and 3 into 2. It is also possible to define the #' old values as a character string, like: ``recode=list(`1`="1,4",`2`="2,3")`` #' For factors or character vectors, an example is: #' ``recode=list(x=c("a","b"),y=c("c","d"))``. #' #' - Value range #' #' Numeric value ranges can be defined using the `:`. Example: #' ``recode=list(`1`=1:3,`2`=4:6)`` would recode all values from 1 to 3 into #' 1, and 4 to 6 into 2. #' #' - `min` and `max` #' #' placeholder to use the minimum or maximum value of the #' (numeric) variable. Useful, e.g., when recoding ranges of values. #' Example: ``recode=list(`1`="min:10",`2`="11:max")``. #' #' - `default` values #' #' The `default` argument defines the default value for all values that have #' no match in the recode-pairs. For example, #' ``recode=list(`1`=c(1,2),`2`=c(3,4)), default=9`` would #' recode values 1 and 2 into 1, 3 and 4 into 2, and all other values into 9. #' If `preserve_na` is set to `FALSE`, `NA` (missing values) will also be #' recoded into the specified default value. #' #' - Reversing and rescaling #' #' See [reverse()] and [rescale()]. #' #' @examples #' # numeric ---------- #' set.seed(123) #' x <- sample(c(1:4, NA), 15, TRUE) #' table(x, useNA = "always") #' #' out <- recode_values(x, list(`0` = 1, `1` = 2:3, `2` = 4)) #' out #' table(out, useNA = "always") #' #' # to recode NA values, set preserve_na to FALSE #' out <- recode_values( #' x, #' list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA), #' preserve_na = FALSE #' ) #' out #' table(out, useNA = "always") #' #' # preserve na ---------- #' out <- recode_values(x, list(`0` = 1, `1` = 2:3), default = 77) #' out #' table(out, useNA = "always") #' #' # recode na into default ---------- #' out <- recode_values( #' x, #' list(`0` = 1, `1` = 2:3), #' default = 77, #' preserve_na = FALSE #' ) #' out #' table(out, useNA = "always") #' #' #' # factors (character vectors are similar) ---------- #' set.seed(123) #' x <- as.factor(sample(c("a", "b", "c"), 15, TRUE)) #' table(x) #' #' out <- recode_values(x, list(x = "a", y = c("b", "c"))) #' out #' table(out) #' #' out <- recode_values(x, list(x = "a", y = "b", z = "c")) #' out #' table(out) #' #' out <- recode_values(x, list(y = "b,c"), default = 77) #' # same as #' # recode_values(x, list(y = c("b", "c")), default = 77) #' out #' table(out) #' #' #' # data frames ---------- #' set.seed(123) #' d <- data.frame( #' x = sample(c(1:4, NA), 12, TRUE), #' y = as.factor(sample(c("a", "b", "c"), 12, TRUE)), #' stringsAsFactors = FALSE #' ) #' #' recode_values( #' d, #' recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c")), #' append = TRUE #' ) #' #' #' # switch recode pattern to "old=new" ---------- #' options(data_recode_pattern = "old=new") #' #' # numeric #' set.seed(123) #' x <- sample(c(1:4, NA), 15, TRUE) #' table(x, useNA = "always") #' #' out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2)) #' table(out, useNA = "always") #' #' # factors (character vectors are similar) #' set.seed(123) #' x <- as.factor(sample(c("a", "b", "c"), 15, TRUE)) #' table(x) #' #' out <- recode_values(x, list(a = "x", `b, c` = "y")) #' table(out) #' #' # reset options #' options(data_recode_pattern = NULL) #' @export recode_values <- function(x, ...) { UseMethod("recode_values") } #' @export recode_values.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( paste0( "Variables of class `", class(x)[1], "` can't be recoded and remain unchanged." ) ) } x } #' @rdname recode_values #' @export recode_values.numeric <- function( x, recode = NULL, default = NULL, preserve_na = TRUE, verbose = TRUE, ... ) { # save original_x <- x # check arguments if (!.recode_args_ok(x, recode, verbose)) { return(x) } # recode-pattern option pattern <- getOption("data_recode_pattern") # make sure NAs are preserved after recoding missing_values <- NULL if (preserve_na) { missing_values <- is.na(x) } # check for "default" token if (!is.null(default)) { # set the default value for all values that have no match # (i.e. that should not be recoded) x <- rep(as.numeric(default), length = length(x)) } for (i in names(recode)) { # based on option-settings, the recode-argument can either follow the # pattern "new=old", or "old=new" if (identical(pattern, "old=new")) { # pattern: old = new, name of list element is old value old_values <- i new_values <- recode[[i]] } else { # pattern: new = old, name of list element is new value old_values <- recode[[i]] new_values <- i } if (is.character(old_values)) { # replace placeholder old_values <- gsub("min", min(x, na.rm = TRUE), old_values, fixed = TRUE) old_values <- gsub("max", max(x, na.rm = TRUE), old_values, fixed = TRUE) # mimic vector if (length(old_values) == 1 && !grepl("c(", old_values, fixed = TRUE)) { old_values <- paste0("c(", old_values, ")") } # parse old values, which are strings (names of element), but which should # contain values, like "1:10" or "1, 2, 3, 4". These should now be in the # format "c(1, 2, 3, 4)" or "c(1:10)", and it should be possible to parse # and evaluate these strings into a numeric vector old_values <- tryCatch( eval(parse(text = old_values)), error = function(e) NULL ) } if (!is.null(old_values) && (is.numeric(old_values) || is.na(old_values))) { x[which(original_x %in% old_values)] <- as.numeric(new_values) } } # set back variable labels, remove value labels # (these are most likely not matching anymore) attr(x, "label") <- attr(original_x, "label", exact = TRUE) attr(x, "labels") <- NULL # set back missing values if (!is.null(missing_values)) { x[missing_values] <- NA } x } #' @export recode_values.factor <- function( x, recode = NULL, default = NULL, preserve_na = TRUE, verbose = TRUE, ... ) { # save original_x <- x # check arguments if (!.recode_args_ok(x, recode, verbose)) { return(x) } # recode-pattern option pattern <- getOption("data_recode_pattern") # make sure NAs are preserved after recoding missing_values <- NULL if (preserve_na) { missing_values <- is.na(x) } # as character, so recoding works x <- as.character(x) # check for "default" token if (!is.null(default)) { # set the default value for all values that have no match # (i.e. that should not be recoded) x <- rep(as.character(default), length = length(x)) } for (i in names(recode)) { # based on option-settings, the recode-argument can either follow the # pattern "new=old", or "old=new" if (identical(pattern, "old=new")) { # pattern: old = new # name of list element is old value old_values <- paste( deparse(insight::trim_ws(unlist( strsplit(i, ",", fixed = TRUE), use.names = FALSE ))), collapse = "," ) # parse old values, which are strings (names of element), but which should # contain values, like "a" or "a, b, c". These should now be in the # format "c("a", "b", "c")" and it should be possible to parse # and evaluate these strings into a numeric vector old_values <- tryCatch( eval(parse(text = old_values)), error = function(e) NULL ) # recode x[which(original_x %in% old_values)] <- recode[[i]] } else { # pattern: new = old # name of list element is new value old_values <- as.character(recode[[i]]) # check input style: "a, b, c" if (length(old_values) == 1 && grepl(",", old_values, fixed = TRUE)) { # split and make character vector old_values <- insight::trim_ws(unlist( strsplit(old_values, ",", fixed = TRUE), use.names = FALSE )) } # recode if (identical(i, "NA")) { x[which(original_x %in% old_values)] <- NA_character_ } else { x[which(original_x %in% old_values)] <- as.character(i) } } } # set back missing values if (!is.null(missing_values)) { x[missing_values] <- NA_character_ } # make sure we have correct new levels x <- droplevels(as.factor(x)) # set back variable labels, remove value labels # (these are most likely not matching anymore) attr(x, "label") <- attr(original_x, "label", exact = TRUE) attr(x, "labels") <- NULL x } #' @export recode_values.character <- function( x, recode = NULL, default = NULL, preserve_na = TRUE, verbose = TRUE, ... ) { # save original_x <- x # check arguments if (!.recode_args_ok(x, recode, verbose)) { return(x) } # recode-pattern option pattern <- getOption("data_recode_pattern") # make sure NAs are preserved after recoding missing_values <- NULL if (preserve_na) { missing_values <- is.na(x) } # check for "default" token if (!is.null(default)) { # set the default value for all values that have no match # (i.e. that should not be recoded) x <- rep(as.character(default), length = length(x)) } for (i in names(recode)) { # based on option-settings, the recode-argument can either follow the # pattern "new=old", or "old=new" if (identical(pattern, "old=new")) { # pattern: old = new # name of list element is old value # name of list element is old value value_string <- paste( deparse(insight::trim_ws(unlist( strsplit(i, ",", fixed = TRUE), use.names = FALSE ))), collapse = "," ) # parse old values, which are strings (names of element), but which should # contain values, like "a" or "a, b, c". These should now be in the # format "c("a", "b", "c")" and it should be possible to parse # and evaluate these strings into a numeric vector old_values <- tryCatch( eval(parse(text = value_string)), error = function(e) NULL ) # recode x[which(original_x %in% old_values)] <- recode[[i]] } else { # pattern: new = old # name of list element is new value old_values <- as.character(recode[[i]]) # check input style: "a, b, c" if (length(old_values) == 1 && grepl(",", old_values, fixed = TRUE)) { # split and make character vector old_values <- insight::trim_ws(unlist( strsplit(old_values, ",", fixed = TRUE), use.names = FALSE )) } # recode if (identical(i, "NA")) { x[which(original_x %in% old_values)] <- NA_character_ } else { x[which(original_x %in% old_values)] <- as.character(i) } } } # set back variable labels, remove value labels # (these are most likely not matching anymore) attr(x, "label") <- attr(original_x, "label", exact = TRUE) attr(x, "labels") <- NULL # set back missing values if (!is.null(missing_values)) { x[missing_values] <- NA_character_ } x } #' @rdname recode_values #' @export recode_values.data.frame <- function( x, select = NULL, exclude = NULL, recode = NULL, default = NULL, preserve_na = TRUE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_r", preserve_value_labels = TRUE ) # update processed arguments x <- my_args$x select <- my_args$select } x[select] <- lapply( x[select], recode_values, recode = recode, default = default, preserve_na = preserve_na, verbose = verbose, ... ) x } # utils -------------------------- .recode_args_ok <- function(x, recode, verbose) { ok <- TRUE # no missings valid <- stats::na.omit(x) # skip if all NA if (!length(valid)) { if (isTRUE(verbose)) { insight::format_warning( "Variable contains only missing values. No recoding carried out." ) } ok <- FALSE } # warn if not a list if (!is.list(recode) || is.null(names(recode))) { if (isTRUE(verbose)) { insight::format_warning( "`recode` needs to be a (named) list. No recoding carried out." ) } ok <- FALSE } ok } ================================================ FILE: R/remove_empty.R ================================================ #' @title Return or remove variables or observations that are completely missing #' @name remove_empty #' @rdname remove_empty #' #' @description #' #' These functions check which rows or columns of a data frame completely #' contain missing values, i.e. which observations or variables completely have #' missing values, and either (1) returns their indices; or (2) removes them #' from the data frame. #' #' @param x A data frame. #' #' @return #' #' - For `empty_columns()` and `empty_rows()`, a numeric (named) vector with row #' or column indices of those variables that completely have missing values. #' #' - For `remove_empty_columns()` and `remove_empty_rows()`, a data frame with #' "empty" columns or rows removed, respectively. #' #' - For `remove_empty()`, **both** empty rows and columns will be removed. #' #' @details For character vectors, empty string values (i.e. `""`) are also #' considered as missing value. Thus, if a character vector only contains `NA` #' and `""`, it is considered as empty variable and will be removed. Same #' applies to observations (rows) that only contain `NA` or `""`. #' #' @examples #' tmp <- data.frame( #' a = c(1, 2, 3, NA, 5), #' b = c(1, NA, 3, NA, 5), #' c = c(NA, NA, NA, NA, NA), #' d = c(1, NA, 3, NA, 5) #' ) #' #' tmp #' #' # indices of empty columns or rows #' empty_columns(tmp) #' empty_rows(tmp) #' #' # remove empty columns or rows #' remove_empty_columns(tmp) #' remove_empty_rows(tmp) #' #' # remove empty columns and rows #' remove_empty(tmp) #' #' # also remove "empty" character vectors #' tmp <- data.frame( #' a = c(1, 2, 3, NA, 5), #' b = c(1, NA, 3, NA, 5), #' c = c("", "", "", "", ""), #' stringsAsFactors = FALSE #' ) #' empty_columns(tmp) #' #' @export empty_columns <- function(x) { if ((!is.matrix(x) && !is.data.frame(x)) || ncol(x) < 2) { vector("numeric") } else { all_na <- colSums(is.na(x)) == nrow(x) all_empty <- vapply( x, function(i) { (is.character(i) || is.factor(i)) && !any(nzchar(as.character(i[!is.na(i)]))) }, FUN.VALUE = logical(1L) ) which(all_na | all_empty) } } #' @rdname remove_empty #' @export empty_rows <- function(x) { if ((!is.matrix(x) && !is.data.frame(x)) || nrow(x) < 2) { vector("numeric") } else { which(rowSums((is.na(x) | x == "")) == ncol(x)) # nolint } } #' @rdname remove_empty #' @export remove_empty_columns <- function(x) { # check if we have any empty columns at all ec <- empty_columns(x) # if yes, removing works, else an empty df would be returned if (length(ec)) { x <- x[-ec] } x } #' @rdname remove_empty #' @export remove_empty_rows <- function(x) { # check if we have any empty rows at all er <- empty_rows(x) # if yes, removing works, else an empty df would be returned if (length(er)) { attr_data <- attributes(x) x <- x[-er, ] x <- .replace_attrs(x, attr_data) } x } #' @rdname remove_empty #' @export remove_empty <- function(x) { x <- remove_empty_rows(x) x <- remove_empty_columns(x) x } ================================================ FILE: R/replace_nan_inf.R ================================================ #' @title Convert infinite or `NaN` values into `NA` #' @name replace_nan_inf #' #' @description #' Replaces all infinite (`Inf` and `-Inf`) or `NaN` values with `NA`. #' #' @param x A vector or a dataframe #' @param ... Currently not used. #' #' @return #' Data with `Inf`, `-Inf`, and `NaN` converted to `NA`. #' #' @examples #' # a vector #' x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7) #' replace_nan_inf(x) #' #' # a data frame #' df <- data.frame( #' x = c(1, NA, 5, Inf, 2, NA), #' y = c(3, NaN, 4, -Inf, 6, 7), #' stringsAsFactors = FALSE #' ) #' replace_nan_inf(df) #' @export replace_nan_inf <- function(x, ...) { UseMethod("replace_nan_inf") } #' @export replace_nan_inf.default <- function(x, ...) { x[is.nan(x) | is.infinite(x)] <- NA x } #' @inheritParams extract_column_names #' @export replace_nan_inf.data.frame <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # Select and deselect cols <- .select_nse( select, x, exclude = exclude, ignore_case, regex = regex, verbose = verbose ) for (i in cols) { x[[i]] <- replace_nan_inf(x[[i]]) } x } ================================================ FILE: R/rescale_weights.R ================================================ #' @title Rescale design weights for multilevel analysis #' @name rescale_weights #' #' @description Most functions to fit multilevel and mixed effects models only #' allow the user to specify frequency weights, but not design (i.e., sampling #' or probability) weights, which should be used when analyzing complex samples #' (e.g., probability samples). `rescale_weights()` implements two algorithms, #' one proposed by \cite{Asparouhov (2006)} and \cite{Carle (2009)}, to rescale #' design weights in survey data to account for the grouping structure of #' multilevel models, and one based on the design effect proposed by #' \cite{Kish (1965)}, to rescale weights by the design effect to account for #' additional sampling error introduced by weighting. #' @param data A data frame. #' @param by Variable names (as character vector, or as formula), indicating #' the grouping structure (strata) of the survey data (level-2-cluster #' variable). It is also possible to create weights for multiple group #' variables; in such cases, each created weighting variable will be suffixed #' by the name of the group variable. This argument is required for #' `method = "carle"`, but optional for `method = "kish"`. #' @param probability_weights Variable indicating the probability (design or #' sampling) weights of the survey data (level-1-weight), provided as character #' string or formula. #' @param nest Logical, if `TRUE` and `by` indicates at least two group #' variables, then groups are "nested", i.e. groups are now a combination from #' each group level of the variables in `by`. This argument is not used when #' `method = "kish"`. #' @param method String, indicating which rescale-method is used for rescaling #' weights. Can be either `"carle"` (default) or `"kish"`. See 'Details'. If #' `method = "carle"`, the `by` argument is required. #' #' @return #' `data`, including the new weighting variable(s). For `method = "carle"`, new #' columns `rescaled_weights_a` and `rescaled_weights_b` are returned, and for #' `method = "kish"`, the returned data contains a column `rescaled_weights`. #' These represent the rescaled design weights to use in multilevel models (use #' these variables for the `weights` argument). #' #' @details #' - `method = "carle"` #' #' Rescaling is based on two methods: For `rescaled_weights_a`, the sample #' weights `probability_weights` are adjusted by a factor that represents the #' proportion of group size divided by the sum of sampling weights within each #' group. The adjustment factor for `rescaled_weights_b` is the sum of sample #' weights within each group divided by the sum of squared sample weights #' within each group (see Carle (2009), Appendix B). In other words, #' `rescaled_weights_a` "scales the weights so that the new weights sum to the #' cluster sample size" while `rescaled_weights_b` "scales the weights so that #' the new weights sum to the effective cluster size". #' #' Regarding the choice between scaling methods A and B, Carle suggests that #' "analysts who wish to discuss point estimates should report results based #' on weighting method A. For analysts more interested in residual #' between-group variance, method B may generally provide the least biased #' estimates". In general, it is recommended to fit a non-weighted model and #' weighted models with both scaling methods and when comparing the models, #' see whether the "inferential decisions converge", to gain confidence in the #' results. #' #' Though the bias of scaled weights decreases with increasing group size, #' method A is preferred when insufficient or low group size is a concern. #' #' The group ID and probably PSU may be used as random effects (e.g. nested #' design, or group and PSU as varying intercepts), depending on the survey #' design that should be mimicked. #' #' - `method = "kish"` #' #' Rescaling is based on scaling the sample weights so the mean value is 1, #' which means the sum of all weights equals the sample size. Next, the design #' effect (_Kish 1965_) is calculated, which is the mean of the squared #' weights divided by the squared mean of the weights. The scaled sample #' weights are then divided by the design effect. This method is most #' appropriate when weights are based on additional variables beyond the #' grouping variables in the model (e.g., other demographic characteristics), #' but may also be useful in other contexts. #' #' Some tests on real-world survey-data suggest that, in comparison to the #' Carle-method, the Kish-method comes closer to estimates from a regular #' survey-design using the **survey** package. Note that these tests are not #' representative and it is recommended to check your results against a #' standard survey-design. #' #' @references #' - Asparouhov T. (2006). General Multi-Level Modeling with Sampling #' Weights. Communications in Statistics - Theory and Methods 35: 439-460 #' #' - Carle A.C. (2009). Fitting multilevel models in complex survey data #' with design weights: Recommendations. BMC Medical Research Methodology #' 9(49): 1-13 #' #' - Kish, L. (1965) Survey Sampling. London: Wiley. #' #' @examplesIf all(insight::check_if_installed(c("lme4", "parameters"), quietly = TRUE)) #' data(nhanes_sample) #' head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) #' #' # also works with multiple group-variables #' head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU"))) #' #' # or nested structures. #' x <- rescale_weights( #' data = nhanes_sample, #' probability_weights = "WTINT2YR", #' by = c("SDMVSTRA", "SDMVPSU"), #' nest = TRUE #' ) #' head(x) #' #' \donttest{ #' # compare different methods, using multilevel-Poisson regression #' #' d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") #' result1 <- lme4::glmer( #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), #' data = d, #' weights = rescaled_weights_a #' ) #' result2 <- lme4::glmer( #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), #' data = d, #' weights = rescaled_weights_b #' ) #' #' d <- rescale_weights( #' nhanes_sample, #' "WTINT2YR", #' method = "kish" #' ) #' result3 <- lme4::glmer( #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), #' data = d, #' weights = rescaled_weights #' ) #' d <- rescale_weights( #' nhanes_sample, #' "WTINT2YR", #' "SDMVSTRA", #' method = "kish" #' ) #' result4 <- lme4::glmer( #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), #' data = d, #' weights = rescaled_weights #' ) #' parameters::compare_parameters( #' list(result1, result2, result3, result4), #' exponentiate = TRUE, #' column_names = c("Carle (A)", "Carle (B)", "Kish", "Kish (grouped)") #' ) #' } #' @export rescale_weights <- function( data, probability_weights = NULL, by = NULL, nest = FALSE, method = "carle" ) { method <- insight::validate_argument(method, c("carle", "kish")) # convert formulas to strings if (inherits(by, "formula")) { by <- all.vars(by) } if (inherits(probability_weights, "formula")) { probability_weights <- all.vars(probability_weights) } # check for existing variable names if ( (method == "carle" && any(c("rescaled_weights_a", "rescaled_weights_b") %in% colnames(data))) || (method == "kish" && "rescaled_weights" %in% colnames(data)) ) { insight::format_warning( "The variable name for the rescaled weights already exists in the data. Returned columns will be renamed into unique names." ) # nolint } # need probability_weights if (is.null(probability_weights)) { insight::format_error( "The argument `probability_weights` is missing, but required to rescale weights." ) } # check if weight has missings. we need to remove them first, # and add back weights to correct cases later weight_missings <- which(is.na(data[[probability_weights]])) weight_non_na <- which(!is.na(data[[probability_weights]])) if (length(weight_missings) > 0) { data_tmp <- data[weight_non_na, ] } else { data_tmp <- data } fun_args <- list( nest = nest, probability_weights = probability_weights, data_tmp = data_tmp, data = data, by = by, weight_non_na = weight_non_na ) switch( method, carle = do.call(.rescale_weights_carle, fun_args), do.call(.rescale_weights_kish, fun_args) ) } # rescale weights, method Kish ---------------------------- .rescale_weights_kish <- function( nest, probability_weights, data_tmp, data, by, weight_non_na ) { # sort id data_tmp$.bamboozled <- seq_len(nrow(data_tmp)) # `nest` is currently ignored if (isTRUE(nest)) { insight::format_warning( "Argument `nest` is ignored for `method = \"kish\"`." ) } # check by argument if (!is.null(by) && !all(by %in% colnames(data_tmp))) { dont_exist <- setdiff(by, colnames(data_tmp)) insight::format_error( paste0( "The following variable(s) specified in `by` don't exist in the dataset: ", text_concatenate(dont_exist), "." ), .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?") ) } else if (is.null(by)) { # if `by` = NULL, we create a dummy group by <- "tmp_kish_by" data_tmp[[by]] <- 1 } # split into groups, and calculate weights out <- lapply(split(data_tmp, data_tmp[by]), function(group_data) { p_weights <- group_data[[probability_weights]] # design effect according to Kish deff <- mean(p_weights^2) / (mean(p_weights)^2) # rescale weights, so their mean is 1 z_weights <- p_weights * (1 / mean(p_weights)) # divide weights by design effect group_data$rescaled_weights <- z_weights / deff group_data }) # bind data result <- do.call(rbind, out) # restore original order result <- result[order(result$.bamboozled), ] # add back rescaled weights to original data, but account for missing observations data$rescaled_weights <- NA_real_ data$rescaled_weights[weight_non_na] <- result$rescaled_weights # return result data } # rescale weights, method Carle ---------------------------- .rescale_weights_carle <- function( nest, probability_weights, data_tmp, data, by, weight_non_na ) { # sort id data_tmp$.bamboozled <- seq_len(nrow(data_tmp)) if (is.null(by)) { insight::format_error( "Argument `by` must be specified. Please provide one or more variable names in `by` that indicate the grouping structure (strata) of the survey data (level-2-cluster variable)." ) # nolint } if (!all(by %in% colnames(data_tmp))) { dont_exist <- setdiff(by, colnames(data_tmp)) insight::format_error( paste0( "The following variable(s) specified in `by` don't exist in the dataset: ", text_concatenate(dont_exist), "." ), .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?") ) } if (nest && length(by) < 2) { insight::format_warning( sprintf( "Only one group variable selected in `by`, no nested structure possible. Rescaling weights for grout '%s' now.", by ) ) nest <- FALSE } if (nest) { out <- .rescale_weights_nested( data_tmp, group = by, probability_weights, nrow(data), weight_non_na ) } else { out <- lapply(by, function(i) { x <- .rescale_weights( data_tmp, i, probability_weights, nrow(data), weight_non_na ) if (length(by) > 1) { colnames(x) <- sprintf(c("pweight_a_%s", "pweight_b_%s"), i) } x }) } make_unique_names <- any(vapply( out, function(i) any(colnames(i) %in% colnames(data)), logical(1) )) # add weights to data frame out <- do.call(cbind, list(data, out)) # check if we have to rename columns if (make_unique_names) { colnames(out) <- make.unique(colnames(out), sep = "_") } out } # rescale weights, for one or more group variables ---------------------------- .rescale_weights <- function(x, group, probability_weights, n, weight_non_na) { # compute sum of weights per group design_weights <- .data_frame( group = sort(unique(x[[group]])), sum_weights_by_group = tapply( x[[probability_weights]], as.factor(x[[group]]), sum ), sum_squared_weights_by_group = tapply( x[[probability_weights]]^2, as.factor(x[[group]]), sum ), n_per_group = as.vector(table(x[[group]])) ) colnames(design_weights)[1] <- group x <- merge(x, design_weights, by = group, sort = FALSE) # restore original order x <- x[order(x$.bamboozled), ] x$.bamboozled <- NULL # multiply the original weight by the fraction of the # sampling unit total population based on Carle 2009 w_a <- x[[probability_weights]] * x$n_per_group / x$sum_weights_by_group w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group out <- data.frame( rescaled_weights_a = rep(NA_real_, times = n), rescaled_weights_b = rep(NA_real_, times = n) ) out$rescaled_weights_a[weight_non_na] <- w_a out$rescaled_weights_b[weight_non_na] <- w_b out } # rescale weights, for nested groups ---------------------------- .rescale_weights_nested <- function( x, group, probability_weights, n, weight_non_na ) { groups <- expand.grid(lapply(group, function(i) sort(unique(x[[i]])))) colnames(groups) <- group # compute sum of weights per group design_weights <- cbind( groups, .data_frame( sum_weights_by_group = unlist( as.list(tapply( x[[probability_weights]], lapply(group, function(i) { as.factor(x[[i]]) }), sum )), use.names = FALSE ), sum_squared_weights_by_group = unlist( as.list(tapply( x[[probability_weights]]^2, lapply(group, function(i) { as.factor(x[[i]]) }), sum )), use.names = FALSE ), n_per_group = unlist(as.list(table(x[, group])), use.names = FALSE) ) ) x <- merge(x, design_weights, by = group, sort = FALSE) # restore original order x <- x[order(x$.bamboozled), ] x$.bamboozled <- NULL # multiply the original weight by the fraction of the # sampling unit total population based on Carle 2009 w_a <- x[[probability_weights]] * x$n_per_group / x$sum_weights_by_group w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group out <- data.frame( rescaled_weights_a = rep(NA_real_, times = n), rescaled_weights_b = rep(NA_real_, times = n) ) out$rescaled_weights_a[weight_non_na] <- w_a out$rescaled_weights_b[weight_non_na] <- w_b out } ================================================ FILE: R/reshape_ci.R ================================================ #' Reshape CI between wide/long formats #' #' Reshape CI between wide/long formats. #' #' @param x A data frame containing columns named `CI_low` and `CI_high` (or #' similar, see `ci_type`). #' @param ci_type String indicating the "type" (i.e. prefix) of the interval #' columns. Per *easystats* convention, confidence or credible intervals are #' named `CI_low` and `CI_high`, and the related `ci_type` would be `"CI"`. #' If column names for other intervals differ, `ci_type` can be used to #' indicate the name, e.g. `ci_type = "SI"` can be used for support intervals, #' where the column names in the data frame would be `SI_low` and `SI_high`. #' #' @return #' #' A data frame with columns corresponding to confidence intervals reshaped #' either to wide or long format. #' #' @examples #' x <- data.frame( #' Parameter = c("Term 1", "Term 2", "Term 1", "Term 2"), #' CI = c(0.8, 0.8, 0.9, 0.9), #' CI_low = c(0.2, 0.3, 0.1, 0.15), #' CI_high = c(0.5, 0.6, 0.8, 0.85), #' stringsAsFactors = FALSE #' ) #' #' reshape_ci(x) #' reshape_ci(reshape_ci(x)) #' @export reshape_ci <- function(x, ci_type = "CI") { # define interval type ci_type <- match.arg(ci_type, choices = c("CI", "SI", "HDI", "ETI")) ci_low <- paste0(ci_type, "_low") ci_high <- paste0(ci_type, "_high") # Long to wide ---------------- if (ci_low %in% names(x) && ci_high %in% names(x) && "CI" %in% names(x)) { ci_position <- which(names(x) == "CI") # Reshape if (length(unique(x$CI)) > 1) { if ("Parameter" %in% names(x)) { idvar <- "Parameter" remove_parameter <- FALSE } else if (is.null(attr(x, "idvars"))) { idvar <- "Parameter" x$Parameter <- NA remove_parameter <- TRUE } else { idvar <- attr(x, "idvars") remove_parameter <- FALSE } x <- stats::reshape( x, idvar = idvar, timevar = "CI", direction = "wide", v.names = c(ci_low, ci_high), sep = "_" ) row.names(x) <- NULL if (remove_parameter) x$Parameter <- NULL } # Replace at the right place ci_colname <- names(x)[ grepl(paste0(ci_low, "_*"), names(x)) | grepl(paste0(ci_high, "_*"), names(x)) ] colnames_1 <- names(x)[0:(ci_position - 1)][ !names(x)[0:(ci_position - 1)] %in% ci_colname ] colnames_2 <- names(x)[!names(x) %in% c(ci_colname, colnames_1)] x <- x[c(colnames_1, ci_colname, colnames_2)] # Wide to long -------------- } else { if ("Parameter" %in% names(x)) { remove_parameter <- FALSE } else { x$Parameter <- seq_len(nrow(x)) remove_parameter <- TRUE } lows <- grepl(paste0(ci_low, "_*"), names(x)) highs <- grepl(paste0(ci_high, "_*"), names(x)) ci <- as.numeric(gsub(paste0(ci_low, "_"), "", names(x)[lows])) if ( paste(ci, collapse = "-") != paste(gsub(paste0(ci_high, "_"), "", names(x)[highs]), collapse = "-") ) { insight::format_error("Something went wrong in the CIs reshaping.") return(x) } if (sum(lows) > 1 && sum(highs) > 1) { low <- stats::reshape( x[!highs], direction = "long", varying = list(names(x)[lows]), sep = "_", timevar = "CI", v.names = ci_low, times = ci ) high <- stats::reshape( x[!lows], direction = "long", varying = list(names(x)[highs]), sep = "_", timevar = "CI", v.names = ci_high, times = ci ) x <- merge(low, high) x$id <- NULL x <- x[order(x$Parameter), ] row.names(x) <- NULL if (remove_parameter) x$Parameter <- NULL } # Replace at the right place ci_position <- which(lows)[1] ci_colname <- c("CI", ci_low, ci_high) colnames_1 <- names(x)[0:(ci_position - 1)][ !names(x)[0:(ci_position - 1)] %in% ci_colname ] colnames_2 <- names(x)[!names(x) %in% c(ci_colname, colnames_1)] x <- x[c(colnames_1, ci_colname, colnames_2)] } class(x) <- intersect(c("data.frame", "numeric"), class(x)) x } ================================================ FILE: R/row_count.R ================================================ #' @title Count specific values row-wise #' @name row_count #' @description `row_count()` mimics base R's `rowSums()`, with sums for a #' specific value indicated by `count`. Hence, it is similar to #' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including #' strict comparisons. Comparisons using `==` coerce values to atomic vectors, #' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also #' possible to make "type safe" comparisons using the `allow_coercion` argument, #' where `"2" == 2` is not true. #' #' @param data A data frame with at least two columns, where number of specific #' values are counted row-wise. #' @param count The value for which the row sum should be computed. May be a #' numeric value, a character string (for factors or character vectors), `NA` or #' `Inf`. #' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same #' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). #' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In #' order to count factor levels in the data, use `count = factor("level")`. See #' 'Examples'. #' #' @inheritParams extract_column_names #' @inheritParams row_means #' #' @return A vector with row-wise counts of values specified in `count`. #' #' @examples #' dat <- data.frame( #' c1 = c(1, 2, NA, 4), #' c2 = c(NA, 2, NA, 5), #' c3 = c(NA, 4, NA, NA), #' c4 = c(2, 3, 7, 8) #' ) #' #' # count all 4s per row #' row_count(dat, count = 4) #' # count all missing values per row #' row_count(dat, count = NA) #' #' dat <- data.frame( #' c1 = c("1", "2", NA, "3"), #' c2 = c(NA, "2", NA, "3"), #' c3 = c(NA, 4, NA, NA), #' c4 = c(2, 3, 7, Inf) #' ) #' # count all 2s and "2"s per row #' row_count(dat, count = 2) #' # only count 2s, but not "2"s #' row_count(dat, count = 2, allow_coercion = FALSE) #' #' dat <- data.frame( #' c1 = factor(c("1", "2", NA, "3")), #' c2 = c("2", "1", NA, "3"), #' c3 = c(NA, 4, NA, NA), #' c4 = c(2, 3, 7, Inf) #' ) #' # find only character "2"s #' row_count(dat, count = "2", allow_coercion = FALSE) #' # find only factor level "2"s #' row_count(dat, count = factor("2"), allow_coercion = FALSE) #' #' @export row_count <- function( data, select = NULL, exclude = NULL, count = NULL, allow_coercion = TRUE, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) { # evaluate arguments select <- .select_nse( select, data, exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) if (is.null(count)) { insight::format_error( "`count` must be a valid value (including `NA` or `Inf`), but not `NULL`." ) } if (is.null(select) || length(select) == 0) { insight::format_error("No columns selected.") } data <- .coerce_to_dataframe(data[select]) # check if we have a data framme with at least two columns if (nrow(data) < 1) { insight::format_error("`data` must be a data frame with at least one row.") } # check if we have a data framme with at least two columns if (ncol(data) < 2) { insight::format_error( "`data` must be a data frame with at least two numeric columns." ) } # special case: count missing if (is.na(count)) { rowSums(is.na(data)) } else { # comparisons in R using == coerce values into a atomic vector, i.e. # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or # "2" == "2" (i.e. we want exact types to be compared only) if (isFALSE(allow_coercion)) { # we need the "type" of the count-value - we use class() instead of typeof(), # because the latter sometimes returns unsuitable classes/types. compare # typeof(as.Date("2020-01-01")), which returns "double". count_type <- class(count)[1] valid_columns <- vapply(data, inherits, TRUE, what = count_type) # check if any columns left? if (!any(valid_columns)) { insight::format_error( "No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`." ) # nolint } data <- data[valid_columns] } # coerce - we have only valid columns anyway, and we need to coerce factors # to vectors, else comparison with `==` errors. count <- as.vector(count) # finally, count rowSums(data == count, na.rm = TRUE) } } ================================================ FILE: R/row_means.R ================================================ #' @title Row means or sums (optionally with minimum amount of valid values) #' @name row_means #' @description This function is similar to the SPSS `MEAN.n` or `SUM.n` #' function and computes row means or row sums from a data frame or matrix if at #' least `min_valid` values of a row are valid (and not `NA`). #' #' @param data A data frame with at least two columns, where row means or row #' sums are applied. #' @param min_valid Optional, a numeric value of length 1. May either be #' - a numeric value that indicates the amount of valid values per row to #' calculate the row mean or row sum; #' - or a value between `0` and `1`, indicating a proportion of valid values per #' row to calculate the row mean or row sum (see 'Details'). #' - `NULL` (default), in which all cases are considered. #' #' If a row's sum of valid values is less than `min_valid`, `NA` will be returned. #' @param digits Numeric value indicating the number of decimal places to be #' used for rounding mean values. Negative values are allowed (see 'Details'). #' By default, `digits = NULL` and no rounding is used. #' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values #' before calculating row means or row sums. Only applies if `min_valid` is not #' specified. #' @param verbose Toggle warnings. #' @inheritParams extract_column_names #' #' @return A vector with row means (for `row_means()`) or row sums (for #' `row_sums()`) for those rows with at least `n` valid values. #' #' @details Rounding to a negative number of `digits` means rounding to a power #' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest #' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value #' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid` #' non-missing values, the row mean or row sum is returned. If `min_valid` is a #' non-integer value from 0 to 1, `min_valid` is considered to indicate the #' proportion of required non-missing values per row. E.g., if #' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid` #' non-missing values for the row mean or row sum to be calculated. See #' 'Examples'. #' #' @examples #' dat <- data.frame( #' c1 = c(1, 2, NA, 4), #' c2 = c(NA, 2, NA, 5), #' c3 = c(NA, 4, NA, NA), #' c4 = c(2, 3, 7, 8) #' ) #' #' # default, all means are shown, if no NA values are present #' row_means(dat) #' #' # remove all NA before computing row means #' row_means(dat, remove_na = TRUE) #' #' # needs at least 4 non-missing values per row #' row_means(dat, min_valid = 4) # 1 valid return value #' row_sums(dat, min_valid = 4) # 1 valid return value #' #' # needs at least 3 non-missing values per row #' row_means(dat, min_valid = 3) # 2 valid return values #' #' # needs at least 2 non-missing values per row #' row_means(dat, min_valid = 2) #' #' # needs at least 1 non-missing value per row, for two selected variables #' row_means(dat, select = c("c1", "c3"), min_valid = 1) #' #' # needs at least 50% of non-missing values per row #' row_means(dat, min_valid = 0.5) # 3 valid return values #' row_sums(dat, min_valid = 0.5) #' #' # needs at least 75% of non-missing values per row #' row_means(dat, min_valid = 0.75) # 2 valid return values #' #' @export row_means <- function( data, select = NULL, exclude = NULL, min_valid = NULL, digits = NULL, ignore_case = FALSE, regex = FALSE, remove_na = FALSE, verbose = TRUE ) { # evaluate arguments select <- .select_nse( select, data, exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) # prepare data, sanity checks data <- .prepare_row_data(data, select, min_valid, verbose) # calculate row means .row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean") } #' @rdname row_means #' @export row_sums <- function( data, select = NULL, exclude = NULL, min_valid = NULL, digits = NULL, ignore_case = FALSE, regex = FALSE, remove_na = FALSE, verbose = TRUE ) { # evaluate arguments select <- .select_nse( select, data, exclude, ignore_case = ignore_case, regex = regex, verbose = verbose ) # prepare data, sanity checks data <- .prepare_row_data(data, select, min_valid, verbose) # calculate row sums .row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum") } # helper ------------------------ # calculate row means or sums .row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) { if (is.null(min_valid)) { # calculate row means or sums for complete data out <- switch( fun, mean = rowMeans(data, na.rm = remove_na), rowSums(data, na.rm = remove_na) ) } else { # is 'min_valid' indicating a proportion? decimals <- min_valid %% 1 if (decimals != 0) { min_valid <- round(ncol(data) * decimals) } # min_valid may not be larger as df's amount of columns if (ncol(data) < min_valid) { insight::format_error( "`min_valid` must be smaller or equal to number of columns in data frame." ) } # row means or sums to_na <- rowSums(is.na(data)) > ncol(data) - min_valid out <- switch( fun, mean = rowMeans(data, na.rm = TRUE), rowSums(data, na.rm = TRUE) ) out[to_na] <- NA } # round, if requested if (!is.null(digits) && !all(is.na(digits))) { out <- round(out, digits = digits) } out } # check that data is in shape for row means or row sums .prepare_row_data <- function(data, select, min_valid, verbose) { if (is.null(select) || length(select) == 0) { insight::format_error("No columns selected.") } data <- .coerce_to_dataframe(data[select]) # n must be a numeric, non-missing value if ( !is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1) ) { insight::format_error("`min_valid` must be a numeric value of length 1.") } # make sure we only have numeric values numeric_columns <- vapply(data, is.numeric, TRUE) if (!all(numeric_columns)) { if (verbose) { insight::format_alert( "Only numeric columns are considered for calculation." ) } data <- data[numeric_columns] } # check if we have a data framme with at least two columns if (ncol(data) < 2) { insight::format_error( "`data` must be a data frame with at least two numeric columns." ) } data } ================================================ FILE: R/select_nse.R ================================================ # Code adapted from {poorman} by Nathan Eastwood [License: MIT] # https://github.com/nathaneastwood/poorman/blob/master/R/select_positions.R .select_nse <- function( select, data, exclude, ignore_case, regex = FALSE, remove_group_var = FALSE, allow_rename = FALSE, verbose = FALSE, ifnotfound = "warn" ) { .check_data(data) columns <- colnames(data) # avoid conflicts conflicting_packages <- .conflicting_packages("poorman") on.exit(.attach_packages(conflicting_packages)) expr_select <- substitute(select, env = parent.frame(1L)) expr_exclude <- substitute(exclude, env = parent.frame(1L)) # when exclude is not an argument called from the function (e.g data_to_long), # do not consider "exclude" as a symbol if (deparse(expr_exclude) == "exclude" && is.null(substitute(exclude))) { expr_exclude <- NULL } # for grouped data frames, we can decide to remove group variable from selection grp_vars <- setdiff(colnames(attr(data, "groups", exact = TRUE)), ".rows") # directly return all names if select == exclude == NULL if (is.null(expr_select) && is.null(expr_exclude)) { # don't include grouping variables if (remove_group_var) { columns <- setdiff(columns, grp_vars) } return(columns) } # get the position of columns that are selected or excluded selected <- .eval_expr( expr_select, data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) excluded <- .eval_expr( expr_exclude, data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) selected_has_mix_idx <- any(selected < 0L) && any(selected > 0L) excluded_has_mix_idx <- any(excluded < 0L) && any(excluded > 0L) if (selected_has_mix_idx || excluded_has_mix_idx) { insight::format_error( "You can't mix negative and positive indices in `select` or `exclude`." ) } # variable positions -> variable names selected <- columns[selected] excluded <- columns[excluded] if (length(selected) == 0L) { if (length(excluded) == 0L) { out <- character(0L) } else { out <- setdiff(columns, excluded) } } else { out <- setdiff(selected, excluded) } # don't include grouping variables if (remove_group_var && length(out)) { out <- setdiff(out, grp_vars) } # for named character vectors, we offer the service to rename the columns if (allow_rename && typeof(expr_select) == "language") { # safe evaluation of the expression, to get the named vector from "select" new_names <- tryCatch(eval(expr_select), error = function(e) NULL) # check if we really have a named vector if (!is.null(new_names) && !is.null(names(new_names))) { # if so, copy names all_names <- names(new_names) # if some of the elements don't have a name, we set the value as name names(new_names)[!nzchar(all_names)] <- new_names[!nzchar(all_names)] # after inclusion and exclusion, the original values in "select" # may have changed, so we check that we only add names of valid values out <- stats::setNames(out, names(new_names)[new_names %in% out]) # check if we have any duplicated names, and if so, give an error if (anyDuplicated(names(out)) > 0) { insight::format_error(paste0( "Following names are duplicated after renaming: ", text_concatenate(names(out)[duplicated(names(out))], enclose = "`"), ". Using duplicated names is no good practice and therefore discouraged. Please provide unique names." )) } } } out } # This is where we dispatch the expression to several helper functions. # This function is called multiple times for expressions that are composed # of several symbols/language. # # Ex: # * "cyl" -> will go to .select_char() and will return directly # * cyl:gear -> function (`:`) so find which function it is, then get the # position for each variable, then evaluate the function with the positions .eval_expr <- function(x, data, ignore_case, regex, verbose, ifnotfound) { if (is.null(x)) { return(NULL) } type <- typeof(x) out <- switch( type, integer = x, double = as.integer(x), character = .select_char( data, x, ignore_case, regex = regex, verbose, ifnotfound ), symbol = .select_symbol( data, x, ignore_case, regex = regex, verbose, ifnotfound ), language = .eval_call( data, x, ignore_case, regex = regex, verbose, ifnotfound ), insight::format_error(paste0( "Expressions of type <", typeof(x), "> cannot be evaluated for use when subsetting." )) ) out } # Possibilities: # - quoted variable name # - quoted variable name with ignore case # - quoted variable name with colon, to indicate range # - character that should be regex-ed on variable names # - special word "all" to return all vars .select_char <- function(data, x, ignore_case, regex, verbose, ifnotfound) { # use colnames because names() doesn't work for matrices columns <- colnames(data) if (isTRUE(regex)) { # string is a regular expression grep(x, columns) } else if (length(x) == 1L && x == "all") { # string is "all" - select all columns seq_along(data) } else if (any(grepl(":", x, fixed = TRUE))) { # special pattern, as string (e.g.select = c("cyl:hp", "am")). However, # this will first go into `.eval_call()` and thus only single elements # are passed in `x` - we have never a character *vector* here # check for valid names colon_vars <- unlist(strsplit(x, ":", fixed = TRUE)) colon_match <- match(colon_vars, columns) if (anyNA(colon_match)) { .action_if_not_found( colon_vars, columns, colon_match, verbose, ifnotfound ) matches <- NA } else { start_pos <- match(colon_vars[1], columns) end_pos <- match(colon_vars[2], columns) if (!is.na(start_pos) && !is.na(end_pos)) { matches <- start_pos:end_pos } else { matches <- NA } } matches[!is.na(matches)] } else if (isTRUE(ignore_case)) { # find columns, case insensitive matches <- match(toupper(x), toupper(columns)) matches[!is.na(matches)] } else { # find columns, case sensitive matches <- match(x, columns) if (anyNA(matches)) { .action_if_not_found(x, columns, matches, verbose, ifnotfound) } matches[!is.na(matches)] } } # small helper, to avoid duplicated code .action_if_not_found <- function(x, columns, matches, verbose, ifnotfound) { msg <- paste0( "Following variable(s) were not found: ", toString(x[is.na(matches)]) ) msg2 <- .misspelled_string( columns, x[is.na(matches)], default_message = "Possibly misspelled?" ) if (ifnotfound == "error") { insight::format_error(msg, msg2) } if (ifnotfound == "warn" && verbose) { insight::format_warning(msg, msg2) } } # 3 types of symbols: # - unquoted variables # - objects that need to be evaluated, e.g data_find(iris, i) where # i is a function arg or is defined before. This can also be a # vector of names or positions. # - functions (without parenthesis) # The first case is easy to deal with. # For the 2nd one, we try to get the value of the object at each environment # (starting from the lower one) until the global environment. If we get its # value but it errors because the function doesn't exist then it means that # it is a select helper that we grab from the error message. .select_symbol <- function(data, x, ignore_case, regex, verbose, ifnotfound) { # We use `tryCatch()` instead of `try()` here, because for grouped data frame # methods, `.dynEval()` can be called many times. Since `tryCatch()` is minimal # faster than `try()`, we get a performance "boost" of some seconds for large # data frames with many groups (see https://github.com/easystats/datawizard/pull/657/) try_eval <- tryCatch(eval(x), error = function(e) NULL) x_dep <- insight::safe_deparse(x) is_select_helper <- FALSE out <- NULL if (x_dep %in% colnames(data)) { matches <- match(x_dep, colnames(data)) out <- matches[!is.na(matches)] } else if (isTRUE(ignore_case)) { matches <- match(toupper(x_dep), toupper(colnames(data))) out <- matches[!is.na(matches)] } else { new_expr <- tryCatch( .dynGet(x, inherits = FALSE, minframe = 0L), error = function(e) { # if starts_with() et al. don't exist fn <- insight::safe_deparse(e$call) # if starts_with() et al. come from tidyselect but need to be used in # a select environment, then the error doesn't have the same structure. if ( is.null(fn) && grepl("must be used within a", e$message, fixed = TRUE) ) { call_trace <- lapply(e$trace$call, function(x) { tmp <- insight::safe_deparse(x) if (grepl(paste0("^", .regex_select_helper()), tmp)) { tmp } }) fn <- Filter(Negate(is.null), call_trace)[1] } # if we actually obtain the select helper call, return it, else return # what we already had if (length(fn) > 0L && grepl(.regex_select_helper(), fn)) { is_select_helper <<- TRUE return(fn) } NULL } ) # when "x" is a function arg which is itself a function call to evaluate, # .dynGet can return "x" infinitely so we try to evaluate this arg # see #414 if (!is.null(new_expr) && insight::safe_deparse(new_expr) == "x") { new_expr <- .dynEval( x, inherits = FALSE, minframe = 0L, remove_n_top_env = 4 ) } if (is_select_helper) { new_expr <- str2lang(unlist(new_expr, use.names = FALSE)) out <- .eval_expr( new_expr, data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) } else if (length(new_expr) == 1L && is.function(new_expr)) { out <- which(vapply(data, new_expr, FUN.VALUE = logical(1L))) } else { out <- unlist( lapply( new_expr, .eval_expr, data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ), use.names = FALSE ) } } # sometimes an object that needs to be evaluated has the same name as a # function (e.g `colnames`). Vector of names have the priority on functions # so function evaluation is delayed at the max. if (is.null(out) && is.function(try_eval)) { cols <- names(data) out <- which(vapply(data, x, FUN.VALUE = logical(1L))) } out } # Dispatch expressions to various select helpers according to the function call. .eval_call <- function(data, x, ignore_case, regex, verbose, ifnotfound) { type <- insight::safe_deparse(x[[1]]) switch( type, `:` = .select_seq(x, data, ignore_case, regex, verbose, ifnotfound), `-` = .select_minus(x, data, ignore_case, regex, verbose, ifnotfound), `c` = .select_c(x, data, ignore_case, regex, verbose, ifnotfound), # nolint `(` = .select_bracket(x, data, ignore_case, regex, verbose, ifnotfound), `[` = .select_square_bracket( x, data, ignore_case, regex, verbose, ifnotfound ), `$` = .select_dollar(x, data, ignore_case, regex, verbose, ifnotfound), `~` = .select_tilde(x, data, ignore_case, regex, verbose, ifnotfound), list = .select_list(x, data, ignore_case, regex, verbose, ifnotfound), names = .select_names(x, data, ignore_case, regex, verbose, ifnotfound), starts_with = , ends_with = , matches = , contains = , regex = .select_helper(x, data, ignore_case, regex, verbose, ifnotfound), .select_context(x, data, ignore_case, regex, verbose, ifnotfound) ) } # e.g 1:3, or gear:cyl .select_seq <- function(expr, data, ignore_case, regex, verbose, ifnotfound) { x <- .eval_expr( expr[[2]], data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) y <- .eval_expr( expr[[3]], data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) x:y } # e.g -cyl .select_minus <- function(expr, data, ignore_case, regex, verbose, ifnotfound) { x <- .eval_expr( expr[[2]], data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) if (length(x) == 0L) { seq_along(data) } else { x * -1L } } # e.g c("gear", "cyl") .select_c <- function(expr, data, ignore_case, regex, verbose, ifnotfound) { lst_expr <- as.list(expr) lst_expr[[1]] <- NULL unlist( lapply( lst_expr, .eval_expr, data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ), use.names = FALSE ) } # e.g -(gear:cyl) .select_bracket <- function( expr, data, ignore_case, regex, verbose, ifnotfound ) { .eval_expr( expr[[2]], data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) } # e.g myvector[3] .select_square_bracket <- function( expr, data, ignore_case, regex, verbose, ifnotfound ) { first_obj <- .eval_expr( expr[[2]], data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) .eval_expr( first_obj[eval(expr[[3]])], data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) } .select_names <- function(expr, data, ignore_case, regex, verbose, ifnotfound) { first_obj <- .dynEval(expr, inherits = FALSE, minframe = 0L) .eval_expr( first_obj, data, ignore_case = ignore_case, regex = regex, verbose = FALSE, ifnotfound = ifnotfound ) } # e.g starts_with("Sep") .select_helper <- function( expr, data, ignore_case, regex, verbose, ifnotfound ) { lst_expr <- as.list(expr) # need this if condition to distinguish between starts_with("Sep") (that we # can use directly) and starts_with(i) (where we need to get i) if (length(lst_expr) == 2L && typeof(lst_expr[[2]]) == "symbol") { collapsed_patterns <- .dynGet( lst_expr[[2]], inherits = FALSE, minframe = 0L ) } else { collapsed_patterns <- paste( unlist(lst_expr[2:length(lst_expr)]), collapse = "|" ) } helper <- insight::safe_deparse(lst_expr[[1]]) rgx <- switch( helper, starts_with = paste0("^(", collapsed_patterns, ")"), ends_with = paste0("(", collapsed_patterns, ")$"), contains = paste0("(", collapsed_patterns, ")"), regex = collapsed_patterns, insight::format_error("There is no select helper called '", helper, "'.") ) # starting in R 4.5, grep() errors if some logical args have NULL/NA if (is.null(ignore_case)) { ignore_case <- FALSE } grep(rgx, colnames(data), ignore.case = ignore_case) } # e.g args$select (happens when we use grouped_data (see center.grouped_df())) .select_dollar <- function( expr, data, ignore_case, regex, verbose, ifnotfound ) { first_obj <- .dynGet( expr[[2]], ifnotfound = NULL, inherits = FALSE, minframe = 0L ) if (is.null(first_obj)) { first_obj <- .dynEval(expr[[2]], inherits = FALSE, minframe = 0L) } .eval_expr( first_obj[[deparse(expr[[3]])]], data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) } # e.g ~ gear + cyl .select_tilde <- function(expr, data, ignore_case, regex, verbose, ifnotfound) { vars <- all.vars(expr) unlist( lapply( vars, .eval_expr, data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ), use.names = FALSE ) } # e.g list(gear = 4, cyl = 5) .select_list <- function(expr, data, ignore_case, regex, verbose, ifnotfound) { vars <- names(.dynEval(expr, inherits = FALSE, minframe = 0L)) unlist( lapply( vars, .eval_expr, data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ), use.names = FALSE ) } # e.g is.numeric() .select_context <- function( expr, data, ignore_case, regex, verbose, ifnotfound ) { x_dep <- insight::safe_deparse(expr) if (endsWith(x_dep, "()")) { new_expr <- gsub("\\(\\)$", "", x_dep) new_expr <- str2lang(new_expr) .eval_expr( new_expr, data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) } else { out <- .dynEval(expr, inherits = FALSE, minframe = 0L) .eval_expr( out, data = data, ignore_case = ignore_case, regex = regex, verbose = verbose, ifnotfound = ifnotfound ) } } # ------------------------------------- .check_data <- function(data) { if (is.null(data)) { insight::format_error("The `data` argument must be provided.") } .coerce_to_dataframe(data) } .regex_select_helper <- function() { "(starts\\_with|ends\\_with|col\\_ends\\_with|contains|regex)" } .conflicting_packages <- function(packages = NULL) { if (is.null(packages)) { packages <- "poorman" } namespace <- vapply(packages, isNamespaceLoaded, FUN.VALUE = logical(1L)) attached <- paste0("package:", packages) %in% search() attached <- stats::setNames(attached, packages) for (i in packages) { unloadNamespace(i) } list(package = packages, namespace = namespace, attached = attached) } .attach_packages <- function(packages = NULL) { if (!is.null(packages)) { pkg <- packages$package for (i in seq_along(pkg)) { if (isTRUE(packages$namespace[i])) { loadNamespace(pkg[i]) } if (isTRUE(packages$attached[i])) { suppressPackageStartupMessages( suppressWarnings( require(pkg[i], quietly = TRUE, character.only = TRUE) ) ) } } } } # Almost identical to dynGet(). The difference is that we deparse the expression # because get0() allows symbol only since R 4.1.0 .dynGet <- function( x, ifnotfound = stop( gettextf("%s not found", sQuote(x)), domain = NA, call. = FALSE ), minframe = 1L, inherits = FALSE ) { x <- insight::safe_deparse(x) n <- sys.nframe() myObj <- structure(list(.b = as.raw(7)), foo = 47L) while (n > minframe) { n <- n - 1L env <- sys.frame(n) r <- get0(x, envir = env, inherits = inherits, ifnotfound = myObj) if (!identical(r, myObj)) { return(r) } } ifnotfound } # Similar to .dynGet() but instead of getting an object from the environment, # we try to evaluate an expression. It stops as soon as the evaluation doesn't # error. Returns NULL if can never be evaluated. # # Custom arg "remove_n_top_env" to remove the first environments which are # ".select_nse()" and the other custom functions. # # Arg "data" is here if we want to start searching in the data instead of the # lowest environment. .dynEval <- function( x, ifnotfound = stop( gettextf("%s not found", sQuote(x)), domain = NA, call. = FALSE ), minframe = 1L, inherits = FALSE, remove_n_top_env = 0, data = NULL ) { iter <- 0 n <- sys.nframe() - remove_n_top_env x <- insight::safe_deparse(x) while (n > minframe) { if (iter == 0 && !is.null(data)) { env <- data iter <- iter + 1 } else { n <- n - 1L env <- sys.frame(n) } # We use `tryCatch()` instead of `try()` here, because for grouped data frame # methods, `.dynEval()` can be called many times. Since `tryCatch()` is minimal # faster than `try()`, we get a performance "boost" of some seconds for large # data frames with many groups (see https://github.com/easystats/datawizard/pull/657/) r <- tryCatch(eval(str2lang(x), envir = env), error = function(e) NULL) if (!is.null(r)) { return(r) } } ifnotfound } ================================================ FILE: R/skewness_kurtosis.R ================================================ #' Compute Skewness and (Excess) Kurtosis #' #' @param x A numeric vector or data.frame. #' @param type Type of algorithm for computing skewness. May be one of `1` #' (or `"1"`, `"I"` or `"classic"`), `2` (or `"2"`, #' `"II"` or `"SPSS"` or `"SAS"`) or `3` (or `"3"`, #' `"III"` or `"Minitab"`). See 'Details'. #' @param iterations The number of bootstrap replicates for computing standard #' errors. If `NULL` (default), parametric standard errors are computed. #' @param test Logical, if `TRUE`, tests if skewness or kurtosis is #' significantly different from zero. #' @param digits Number of decimal places. #' @param object An object returned by `skewness()` or `kurtosis()`. #' @param verbose Toggle warnings and messages. #' @param ... Arguments passed to or from other methods. #' @inheritParams coef_var #' #' @details #' #' \subsection{Skewness}{ #' Symmetric distributions have a `skewness` around zero, while #' a negative skewness values indicates a "left-skewed" distribution, and a #' positive skewness values indicates a "right-skewed" distribution. Examples #' for the relationship of skewness and distributions are: #' #' - Normal distribution (and other symmetric distribution) has a skewness #' of 0 #' - Half-normal distribution has a skewness just below 1 #' - Exponential distribution has a skewness of 2 #' - Lognormal distribution can have a skewness of any positive value, #' depending on its parameters #' #' (\cite{https://en.wikipedia.org/wiki/Skewness}) #' } #' #' \subsection{Types of Skewness}{ #' `skewness()` supports three different methods for estimating skewness, #' as discussed in \cite{Joanes and Gill (1988)}: #' #' - Type "1" is the "classical" method, which is `g1 = (sum((x - #' mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5` #' #' - Type "2" first calculates the type-1 skewness, then adjusts the result: #' `G1 = g1 * sqrt(n * (n - 1)) / (n - 2)`. This is what SAS and SPSS #' usually return. #' #' - Type "3" first calculates the type-1 skewness, then adjusts the result: #' `b1 = g1 * ((1 - 1 / n))^1.5`. This is what Minitab usually returns. #' } #' #' \subsection{Kurtosis}{ #' The `kurtosis` is a measure of "tailedness" of a distribution. A #' distribution with a kurtosis values of about zero is called "mesokurtic". A #' kurtosis value larger than zero indicates a "leptokurtic" distribution with #' *fatter* tails. A kurtosis value below zero indicates a "platykurtic" #' distribution with *thinner* tails #' (\cite{https://en.wikipedia.org/wiki/Kurtosis}). #' } #' #' \subsection{Types of Kurtosis}{ #' `kurtosis()` supports three different methods for estimating kurtosis, #' as discussed in \cite{Joanes and Gill (1988)}: #' #' - Type "1" is the "classical" method, which is `g2 = n * sum((x - #' mean(x))^4) / (sum((x - mean(x))^2)^2) - 3`. #' #' - Type "2" first calculates the type-1 kurtosis, then adjusts the result: #' `G2 = ((n + 1) * g2 + 6) * (n - 1)/((n - 2) * (n - 3))`. This is what #' SAS and SPSS usually return #' #' - Type "3" first calculates the type-1 kurtosis, then adjusts the result: #' `b2 = (g2 + 3) * (1 - 1 / n)^2 - 3`. This is what Minitab usually #' returns. #' #' } #' #' \subsection{Standard Errors}{ #' It is recommended to compute empirical (bootstrapped) standard errors (via #' the `iterations` argument) than relying on analytic standard errors #' (\cite{Wright & Herrington, 2011}). #' } #' #' @references #' #' - D. N. Joanes and C. A. Gill (1998). Comparing measures of sample #' skewness and kurtosis. The Statistician, 47, 183–189. #' #' - Wright, D. B., & Herrington, J. A. (2011). Problematic standard #' errors and confidence intervals for skewness and kurtosis. Behavior #' research methods, 43(1), 8-17. #' #' @return Values of skewness or kurtosis. #' #' @examples #' skewness(rnorm(1000)) #' kurtosis(rnorm(1000)) #' @export skewness <- function(x, ...) { UseMethod("skewness") } # skewness ----------------------------------------- #' @rdname skewness #' @export skewness.numeric <- function( x, remove_na = TRUE, type = "2", iterations = NULL, verbose = TRUE, ... ) { if (remove_na) { x <- x[!is.na(x)] } n <- length(x) out <- (sum((x - mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5 type <- .check_skewness_type(type) if (type == "2" && n < 3) { if (verbose) { insight::format_warning( "Need at least 3 complete observations for type-2-skewness. Using 'type=\"1\"' now." ) } type <- "1" } .skewness <- switch( type, "1" = out, "2" = out * sqrt(n * (n - 1)) / (n - 2), "3" = out * ((1 - 1 / n))^1.5 ) out_se <- sqrt((6 * (n - 2)) / ((n + 1) * (n + 3))) .skewness_se <- switch( type, "1" = out_se, "2" = out_se * ((sqrt(n * (n - 1))) / (n - 2)), "3" = out_se * (((n - 1) / n)^1.5), ) if (!is.null(iterations)) { if (requireNamespace("boot", quietly = TRUE)) { results <- boot::boot( data = x, statistic = .boot_skewness, R = iterations, remove_na = remove_na, type = type ) out_se <- stats::sd(results$t, na.rm = TRUE) } else { insight::format_warning("Package 'boot' needed for bootstrapping SEs.") } } .skewness <- data.frame( Skewness = .skewness, SE = out_se ) class(.skewness) <- unique(c("parameters_skewness", class(.skewness))) .skewness } #' @export skewness.matrix <- function( x, remove_na = TRUE, type = "2", iterations = NULL, ... ) { .skewness <- apply( x, 2, skewness, remove_na = remove_na, type = type, iterations = iterations ) .names <- colnames(x) if (length(.names) == 0) { .names <- paste0("X", seq_len(ncol(x))) } .skewness <- cbind(Parameter = .names, do.call(rbind, .skewness)) class(.skewness) <- unique(c("parameters_skewness", class(.skewness))) .skewness } #' @export skewness.data.frame <- function( x, remove_na = TRUE, type = "2", iterations = NULL, ... ) { .skewness <- lapply( x, skewness, remove_na = remove_na, type = type, iterations = iterations ) .skewness <- cbind(Parameter = names(.skewness), do.call(rbind, .skewness)) class(.skewness) <- unique(c("parameters_skewness", class(.skewness))) .skewness } #' @export skewness.default <- function( x, remove_na = TRUE, type = "2", iterations = NULL, ... ) { skewness( .factor_to_numeric(x), remove_na = remove_na, type = type, iterations = iterations ) } # Kurtosis ----------------------------------- #' @rdname skewness #' @export kurtosis <- function(x, ...) { UseMethod("kurtosis") } #' @rdname skewness #' @export kurtosis.numeric <- function( x, remove_na = TRUE, type = "2", iterations = NULL, verbose = TRUE, ... ) { if (remove_na) { x <- x[!is.na(x)] } n <- length(x) out <- n * sum((x - mean(x))^4) / (sum((x - mean(x))^2)^2) type <- .check_skewness_type(type) if (type == "2" && n < 4) { if (verbose) { insight::format_warning( "Need at least 4 complete observations for type-2-kurtosis Using 'type=\"1\"' now." ) } type <- "1" } .kurtosis <- switch( type, "1" = out - 3, "2" = ((n + 1) * (out - 3) + 6) * (n - 1) / ((n - 2) * (n - 3)), "3" = out * (1 - 1 / n)^2 - 3 ) out_se <- sqrt( (24 * n * (n - 2) * (n - 3)) / (((n + 1)^2) * (n + 3) * (n + 5)) ) .kurtosis_se <- switch( type, "1" = out_se, "2" = out_se * (((n - 1) * (n + 1)) / ((n - 2) * (n - 3))), "3" = out_se * ((n - 1) / n)^2 ) if (!is.null(iterations)) { insight::check_if_installed("boot") results <- boot::boot( data = x, statistic = .boot_kurtosis, R = iterations, remove_na = remove_na, type = type ) out_se <- stats::sd(results$t, na.rm = TRUE) } .kurtosis <- data.frame( Kurtosis = .kurtosis, SE = out_se ) class(.kurtosis) <- unique(c("parameters_kurtosis", class(.kurtosis))) .kurtosis } #' @export kurtosis.matrix <- function( x, remove_na = TRUE, type = "2", iterations = NULL, ... ) { .kurtosis <- apply( x, 2, kurtosis, remove_na = remove_na, type = type, iterations = iterations ) .names <- colnames(x) if (length(.names) == 0) { .names <- paste0("X", seq_len(ncol(x))) } .kurtosis <- cbind(Parameter = .names, do.call(rbind, .kurtosis)) class(.kurtosis) <- unique(c("parameters_kurtosis", class(.kurtosis))) .kurtosis } #' @export kurtosis.data.frame <- function( x, remove_na = TRUE, type = "2", iterations = NULL, ... ) { .kurtosis <- lapply( x, kurtosis, remove_na = remove_na, type = type, iterations = iterations ) .kurtosis <- cbind(Parameter = names(.kurtosis), do.call(rbind, .kurtosis)) class(.kurtosis) <- unique(c("parameters_kurtosis", class(.kurtosis))) .kurtosis } #' @export kurtosis.default <- function( x, remove_na = TRUE, type = "2", iterations = NULL, ... ) { kurtosis( .factor_to_numeric(x), remove_na = remove_na, type = type, iterations = iterations ) } # methods ----------------------------------------- #' @export as.numeric.parameters_kurtosis <- function(x, ...) { x$Kurtosis } #' @export as.numeric.parameters_skewness <- function(x, ...) { x$Skewness } #' @export as.double.parameters_kurtosis <- as.numeric.parameters_kurtosis #' @export as.double.parameters_skewness <- as.numeric.parameters_skewness #' @rdname skewness #' @export print.parameters_kurtosis <- function(x, digits = 3, test = FALSE, ...) { out <- summary(x, test = test) cat(insight::export_table(out, digits = digits)) invisible(x) } #' @rdname skewness #' @export print.parameters_skewness <- print.parameters_kurtosis #' @rdname skewness #' @export summary.parameters_skewness <- function(object, test = FALSE, ...) { if (test) { object$z <- object$Skewness / object$SE object$p <- 2 * (1 - stats::pnorm(abs(object$z))) } object } #' @rdname skewness #' @export summary.parameters_kurtosis <- function(object, test = FALSE, ...) { if (test) { object$z <- object$Kurtosis / object$SE object$p <- 2 * (1 - stats::pnorm(abs(object$z))) } object } # helper ------------------------------------------ .check_skewness_type <- function(type) { # convenience if (is.numeric(type)) { type <- as.character(type) } skewness_types <- c( "1", "2", "3", "I", "II", "III", "classic", "SPSS", "SAS", "Minitab" ) is_skewness_type_invalid <- is.null(type) || is.na(type) || !(type %in% skewness_types) if (is_skewness_type_invalid) { insight::format_warning( "'type' must be a character value from \"1\" to \"3\". Using 'type=\"2\"' now." ) type <- "2" } switch( type, `1` = , I = , classic = "1", `2` = , II = , SPSS = , SAS = "2", `3` = , III = , Minitab = "3" ) } # bootstrapping ----------------------------------- .boot_skewness <- function(data, indices, remove_na, type) { datawizard::skewness( data[indices], remove_na = remove_na, type = type, iterations = NULL )$Skewness } .boot_kurtosis <- function(data, indices, remove_na, type) { datawizard::kurtosis( data[indices], remove_na = remove_na, type = type, iterations = NULL )$Kurtosis } ================================================ FILE: R/slide.R ================================================ #' @title Shift numeric value range #' @name slide #' #' @description #' This functions shifts the value range of a numeric variable, so that the #' new range starts at a given value. #' #' @param x A data frame or numeric vector. #' @param verbose Toggle warnings. #' @param ... not used. #' @inheritParams to_numeric #' #' @return `x`, where the range of numeric variables starts at a new value. #' #' @inheritSection center Selection of variables - the `select` argument #' #' @inherit data_rename seealso #' #' @examples #' # numeric #' head(mtcars$gear) #' head(slide(mtcars$gear)) #' head(slide(mtcars$gear, lowest = 10)) #' #' # data frame #' sapply(slide(mtcars, lowest = 1), min) #' sapply(mtcars, min) #' @export slide <- function(x, ...) { UseMethod("slide") } #' @export slide.default <- function(x, lowest = 0, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( "Shifting non-numeric variables is not possible.", "Try using 'to_numeric()' and specify the 'lowest' argument." ) } x } #' @rdname slide #' @export slide.numeric <- function(x, lowest = 0, ...) { original_x <- x minval <- min(x, na.rm = TRUE) difference <- minval - lowest x <- x - difference .set_back_labels(x, original_x, include_values = FALSE) } #' @rdname slide #' @export slide.data.frame <- function( x, select = NULL, exclude = NULL, lowest = 0, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments my_args <- .process_append( x, select, append, append_suffix = "_s", keep_factors = FALSE ) # update processed arguments x <- my_args$x select <- my_args$select } x[select] <- lapply( x[select], slide, lowest = lowest, verbose = verbose, ... ) x } ================================================ FILE: R/smoothness.R ================================================ #' Quantify the smoothness of a vector #' #' @param x Numeric vector (similar to a time series). #' @param method Can be `"diff"` (the standard deviation of the standardized #' differences) or `"cor"` (default, lag-one autocorrelation). #' @param lag An integer indicating which lag to use. If less than `1`, will be #' interpreted as expressed in percentage of the length of the vector. #' @inheritParams skewness #' #' @examples #' x <- (-10:10)^3 + rnorm(21, 0, 100) #' plot(x) #' smoothness(x, method = "cor") #' smoothness(x, method = "diff") #' @return Value of smoothness. #' @references https://stats.stackexchange.com/questions/24607/how-to-measure-smoothness-of-a-time-series-in-r #' #' @export smoothness <- function(x, method = "cor", lag = 1, iterations = NULL, ...) { UseMethod("smoothness") } #' @export smoothness.numeric <- function( x, method = "cor", lag = 1, iterations = NULL, ... ) { if (lag < 1) { lag <- round(lag * length(x)) } if (lag <= 0) { insight::format_error("'lag' cannot be that small.") } if (method == "cor") { smooth_data <- stats::cor( utils::head(x, length(x) - lag), utils::tail(x, length(x) - lag) ) } else { smooth_data <- stats::sd(diff(x, lag = lag)) / abs(mean(diff(x, lag = lag))) } if (!is.null(iterations)) { if (requireNamespace("boot", quietly = TRUE)) { results <- boot::boot( data = x, statistic = .boot_smoothness, R = iterations, method = method, lag = lag ) out_se <- stats::sd(results$t, na.rm = TRUE) smooth_data <- data.frame(Smoothness = smooth_data, SE = out_se) } else { insight::format_warning("Package 'boot' needed for bootstrapping SEs.") } } class(smooth_data) <- unique(c("parameters_smoothness", class(smooth_data))) smooth_data } #' @export smoothness.data.frame <- function( x, method = "cor", lag = 1, iterations = NULL, ... ) { .smoothness <- lapply( x, smoothness, method = method, lag = lag, iterations = iterations ) .smoothness <- cbind( Parameter = names(.smoothness), do.call(rbind, .smoothness) ) class(.smoothness) <- unique(c("parameters_smoothness", class(.smoothness))) .smoothness } #' @export smoothness.default <- function( x, method = "cor", lag = 1, iterations = NULL, ... ) { smoothness( .factor_to_numeric(x), method = method, lag = lag, iterations = iterations ) } # bootstrapping ----------------------------------- .boot_smoothness <- function(data, indices, method, lag) { datawizard::smoothness( x = data[indices], method = method, lag = lag, iterations = NULL ) } # methods ----------------------------------------- #' @export as.numeric.parameters_smoothness <- function(x, ...) { if (is.data.frame(x)) { x$Smoothness } else { as.vector(x) } } #' @export as.double.parameters_smoothness <- as.numeric.parameters_smoothness ================================================ FILE: R/standardize.R ================================================ #' Standardization (Z-scoring) #' #' Performs a standardization of data (z-scoring), i.e., centering and scaling, #' so that the data is expressed in terms of standard deviation (i.e., mean = 0, #' SD = 1) or Median Absolute Deviance (median = 0, MAD = 1). When applied to a #' statistical model, this function extracts the dataset, standardizes it, and #' refits the model with this standardized version of the dataset. The #' [normalize()] function can also be used to scale all numeric variables within #' the 0 - 1 range. #' \cr\cr #' For model standardization, see [`standardize.default()`]. #' #' @param x A (grouped) data frame, a vector or a statistical model (for #' `unstandardize()` cannot be a model). #' @param robust Logical, if `TRUE`, centering is done by subtracting the #' median from the variables and dividing it by the median absolute deviation #' (MAD). If `FALSE`, variables are standardized by subtracting the #' mean and dividing it by the standard deviation (SD). #' @param two_sd If `TRUE`, the variables are scaled by two times the deviation #' (SD or MAD depending on `robust`). This method can be useful to obtain #' model coefficients of continuous parameters comparable to coefficients #' related to binary predictors, when applied to **the predictors** (not the #' outcome) (Gelman, 2008). #' @param weights Can be `NULL` (for no weighting), or: #' - For model: if `TRUE` (default), a weighted-standardization is carried out. #' - For `data.frame`s: a numeric vector of weights, or a character of the #' name of a column in the `data.frame` that contains the weights. #' - For numeric vectors: a numeric vector of weights. #' @param verbose Toggle warnings and messages on or off. #' @param remove_na How should missing values (`NA`) be treated: if `"none"` #' (default): each column's standardization is done separately, ignoring #' `NA`s. Else, rows with `NA` in the columns selected with `select` / #' `exclude` (`"selected"`) or in all columns (`"all"`) are dropped before #' standardization, and the resulting data frame does not include these cases. #' @param force Logical, if `TRUE`, forces standardization of factors and dates #' as well. Factors are converted to numerical values, with the lowest level #' being the value `1` (unless the factor has numeric levels, which are #' converted to the corresponding numeric value). #' @param append Logical or string. If `TRUE`, standardized variables get new #' column names (with the suffix `"_z"`) and are appended (column bind) to `x`, #' thus returning both the original and the standardized variables. If `FALSE`, #' original variables in `x` will be overwritten by their standardized versions. #' If a character value, standardized variables are appended with new column #' names (using the defined suffix) to the original data frame. #' @param reference A data frame or variable from which the centrality and #' deviation will be computed instead of from the input variable. Useful for #' standardizing a subset or new data according to another data frame. #' @param center,scale #' * For `standardize()`: \cr #' Numeric values, which can be used as alternative to `reference` to define #' a reference centrality and deviation. If `scale` and `center` are of #' length 1, they will be recycled to match the length of selected variables #' for standardization. Else, `center` and `scale` must be of same length as #' the number of selected variables. Values in `center` and `scale` will be #' matched to selected variables in the provided order, unless a named vector #' is given. In this case, names are matched against the names of the selected #' variables. #' #' * For `unstandardize()`: \cr #' `center` and `scale` correspond to the center (the mean / median) and the scale (SD / MAD) of #' the original non-standardized data (for data frames, should be named, or #' have column order correspond to the numeric column). However, one can also #' directly provide the original data through `reference`, from which the #' center and the scale will be computed (according to `robust` and `two_sd`). #' Alternatively, if the input contains the attributes `center` and `scale` #' (as does the output of `standardize()`), it will take it from there if the #' rest of the arguments are absent. #' @param force Logical, if `TRUE`, forces recoding of factors and character #' vectors as well. #' @param ... Arguments passed to or from other methods. #' @inheritParams extract_column_names #' #' @inheritSection center Selection of variables - the `select` argument #' #' @return The standardized object (either a standardize data frame or a #' statistical model fitted on standardized data). #' #' @note When `x` is a vector or a data frame with `remove_na = "none")`, #' missing values are preserved, so the return value has the same length / #' number of rows as the original input. #' #' @seealso See [center()] for grand-mean centering of variables, and #' [makepredictcall.dw_transformer()] for use in model formulas. #' #' @family transform utilities #' @family standardize #' #' @examples #' d <- iris[1:4, ] #' #' # vectors #' standardise(d$Petal.Length) #' #' # Data frames #' # overwrite #' standardise(d, select = c("Sepal.Length", "Sepal.Width")) #' #' # append #' standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE) #' #' # append, suffix #' standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = "_std") #' #' # standardizing with reference center and scale #' d <- data.frame( #' a = c(-2, -1, 0, 1, 2), #' b = c(3, 4, 5, 6, 7) #' ) #' #' # default standardization, based on mean and sd of each variable #' standardize(d) # means are 0 and 5, sd ~ 1.581139 #' #' # standardization, based on mean and sd set to the same values #' standardize(d, center = c(0, 5), scale = c(1.581, 1.581)) #' #' # standardization, mean and sd for each variable newly defined #' standardize(d, center = c(3, 4), scale = c(2, 4)) #' #' # standardization, taking same mean and sd for each variable #' standardize(d, center = 1, scale = 3) #' @export standardize <- function(x, ...) { UseMethod("standardize") } #' @rdname standardize #' @export standardise <- standardize # Default method is in effectsize # standardize.default <- function(x, verbose = TRUE, ...) { # if (isTRUE(verbose)) { # insight::format_alert(sprintf("Standardizing currently not possible for variables of class '%s'.", class(x)[1]))) # } # x # } #' @rdname standardize #' @export standardize.numeric <- function( x, robust = FALSE, two_sd = FALSE, weights = NULL, reference = NULL, center = NULL, scale = NULL, verbose = TRUE, ... ) { # set default - need to fix this, else we don't know whether this # comes from "center()" or "standardize()". Furthermore, data.frame # methods cannot return a vector of NULLs for each variable - instead # they return NA. Thus, we have to treat NA like NULL if (is.null(scale) || is.na(scale)) { scale <- TRUE } if (is.null(center) || is.na(center)) { center <- TRUE } my_args <- .process_std_center( x, weights, robust, verbose, reference, center, scale ) dot_args <- list(...) # Perform standardization if (is.null(my_args)) { # all NA? return(x) } else if (is.null(my_args$check)) { vals <- rep(0, length(my_args$vals)) # If only unique value } else if (two_sd) { vals <- as.vector((my_args$vals - my_args$center) / (2 * my_args$scale)) } else { vals <- as.vector((my_args$vals - my_args$center) / my_args$scale) } scaled_x <- rep(NA, length(my_args$valid_x)) scaled_x[my_args$valid_x] <- vals attr(scaled_x, "center") <- my_args$center attr(scaled_x, "scale") <- my_args$scale attr(scaled_x, "robust") <- robust # labels z <- .set_back_labels(scaled_x, x, include_values = FALSE) if (!isFALSE(dot_args$add_transform_class)) { class(z) <- c("dw_transformer", class(z)) } z } #' @export standardize.double <- standardize.numeric #' @export standardize.integer <- standardize.numeric #' @export standardize.matrix <- function(x, ...) { xl <- lapply(seq_len(ncol(x)), function(i) x[, i]) xz <- lapply(xl, datawizard::standardize, ...) x_out <- do.call(cbind, xz) dimnames(x_out) <- dimnames(x) attr(x_out, "center") <- vapply(xz, attr, "center", FUN.VALUE = numeric(1L)) attr(x_out, "scale") <- vapply(xz, attr, "scale", FUN.VALUE = numeric(1L)) attr(x_out, "robust") <- vapply(xz, attr, "robust", FUN.VALUE = logical(1L))[ 1 ] class(x_out) <- c("dw_transformer", class(x_out)) x_out } #' @rdname standardize #' @export standardize.factor <- function( x, robust = FALSE, two_sd = FALSE, weights = NULL, force = FALSE, verbose = TRUE, ... ) { if (!force) { return(x) } standardize( .factor_to_numeric(x), robust = robust, two_sd = two_sd, weights = weights, verbose = verbose, ... ) } #' @export standardize.character <- standardize.factor #' @export standardize.logical <- standardize.factor #' @export standardize.Date <- standardize.factor #' @export standardize.AsIs <- standardize.numeric # Data frames ------------------------------------------------------------- #' @rdname standardize #' @export standardize.data.frame <- function( x, select = NULL, exclude = NULL, robust = FALSE, two_sd = FALSE, weights = NULL, reference = NULL, center = NULL, scale = NULL, remove_na = c("none", "selected", "all"), force = FALSE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # process arguments my_args <- .process_std_args( x, select, exclude, weights, append, append_suffix = "_z", keep_factors = force, remove_na, reference, .center = center, .scale = scale ) # set new values x <- my_args$x # Loop through variables and standardize it for (var in my_args$select) { x[[var]] <- standardize( x[[var]], robust = robust, two_sd = two_sd, weights = my_args$weights, reference = reference[[var]], center = my_args$center[var], scale = my_args$scale[var], verbose = FALSE, force = force, add_transform_class = FALSE ) } attr(x, "center") <- unlist(lapply(x[my_args$select], function(z) { attributes(z)$center })) attr(x, "scale") <- unlist(lapply(x[my_args$select], function(z) { attributes(z)$scale })) attr(x, "robust") <- robust x } #' @export standardize.grouped_df <- function( x, select = NULL, exclude = NULL, robust = FALSE, two_sd = FALSE, weights = NULL, reference = NULL, center = NULL, scale = NULL, remove_na = c("none", "selected", "all"), force = FALSE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) my_args <- .process_grouped_df( x, select, exclude, append, append_suffix = "_z", reference, weights, keep_factors = force ) # create column(s) to store dw_transformer attributes for (i in select) { my_args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(my_args$grps)) } for (rows in seq_along(my_args$grps)) { tmp <- standardize( my_args$x[my_args$grps[[rows]], , drop = FALSE], select = my_args$select, exclude = NULL, robust = robust, two_sd = two_sd, weights = my_args$weights, remove_na = remove_na, verbose = verbose, force = force, append = FALSE, center = center, scale = scale, add_transform_class = FALSE, ... ) # store dw_transformer_attributes for (i in select) { my_args$info$groups[rows, paste0("attr_", i)][[ 1 ]] <- list(unlist(attributes(tmp[[i]]))) } my_args$x[my_args$grps[[rows]], ] <- tmp } # last column of "groups" attributes must be called ".rows" my_args$info$groups <- data_relocate(my_args$info$groups, ".rows", after = -1) # set back class, so data frame still works with dplyr attributes(my_args$x) <- my_args$info my_args$x } # Datagrid ---------------------------------------------------------------- #' @export standardize.datagrid <- function(x, ...) { x[names(x)] <- standardize( as.data.frame(x), reference = attributes(x)$data, ... ) x } #' @export standardize.visualisation_matrix <- standardize.datagrid ================================================ FILE: R/standardize.models.R ================================================ #' Re-fit a model with standardized data #' #' Performs a standardization of data (z-scoring) using #' [`standardize()`] and then re-fits the model to the standardized data. #' \cr\cr #' Standardization is done by completely refitting the model on the standardized #' data. Hence, this approach is equal to standardizing the variables *before* #' fitting the model and will return a new model object. This method is #' particularly recommended for complex models that include interactions or #' transformations (e.g., polynomial or spline terms). The `robust` (default to #' `FALSE`) argument enables a robust standardization of data, based on the #' `median` and the `MAD` instead of the `mean` and the `SD`. #' #' @param x A statistical model. #' @param weights If `TRUE` (default), a weighted-standardization is carried out. #' @param include_response If `TRUE` (default), the response value will also be #' standardized. If `FALSE`, only the predictors will be standardized. #' - Note that for GLMs and models with non-linear link functions, the #' response value will not be standardized, to make re-fitting the model work. #' - If the model contains an [stats::offset()], the offset variable(s) will #' be standardized only if the response is standardized. If `two_sd = TRUE`, #' offsets are standardized by one-sd (similar to the response). #' - (For `mediate` models, the `include_response` refers to the outcome in #' the y model; m model's response will always be standardized when possible). #' @inheritParams standardize #' #' @return A statistical model fitted on standardized data #' #' @details #' #' # Generalized Linear Models #' Standardization for generalized linear models (GLM, GLMM, etc) is done only #' with respect to the predictors (while the outcome remains as-is, #' unstandardized) - maintaining the interpretability of the coefficients (e.g., #' in a binomial model: the exponent of the standardized parameter is the OR of #' a change of 1 SD in the predictor, etc.) #' #' # Dealing with Factors #' `standardize(model)` or `standardize_parameters(model, method = "refit")` do #' *not* standardize categorical predictors (i.e. factors) / their #' dummy-variables, which may be a different behaviour compared to other R #' packages (such as **lm.beta**) or other software packages (like SPSS). To #' mimic such behaviours, either use `standardize_parameters(model, method = #' "basic")` to obtain post-hoc standardized parameters, or standardize the data #' with `standardize(data, force = TRUE)` *before* fitting the #' model. #' #' # Transformed Variables #' When the model's formula contains transformations (e.g. `y ~ exp(X)`) the #' transformation effectively takes place after standardization (e.g., #' `exp(scale(X))`). Since some transformations are undefined for none positive #' values, such as `log()` and `sqrt()`, the relevel variables are shifted (post #' standardization) by `Z - min(Z) + 1` or `Z - min(Z)` (respectively). #' #' #' @family standardize #' @examples #' model <- lm(Infant.Mortality ~ Education * Fertility, data = swiss) #' coef(standardize(model)) #' #' @export #' @aliases standardize_models standardize.default <- function( x, robust = FALSE, two_sd = FALSE, weights = TRUE, verbose = TRUE, include_response = TRUE, ... ) { if (!insight::is_model(x)) { insight::format_warning( paste0( "Objects or variables of class '", class(x)[1], "' cannot be standardized." ) ) return(x) } data_std <- NULL # needed to avoid note .standardize_models( x, robust = robust, two_sd = two_sd, weights = weights, verbose = verbose, include_response = include_response, update_expr = stats::update(x, data = data_std), ... ) } .standardize_models <- function( x, robust = FALSE, two_sd = FALSE, weights = TRUE, verbose = TRUE, include_response = TRUE, update_expr, ... ) { # check model formula. Some notations don't work when standardizing data insight::formula_ok( x, action = "error", prefix_msg = "Model cannot be standardized.", verbose = verbose ) m_info <- .get_model_info(x, ...) model_data <- insight::get_data(x, source = "mf", verbose = FALSE) if (isTRUE(attr(model_data, "is_subset"))) { insight::format_error("Cannot standardize a model fit with a 'subset = '.") } if (m_info$is_bayesian && verbose) { insight::format_warning( "Standardizing variables without adjusting priors may lead to bogus results unless priors are auto-scaled." ) } ## ---- Z the RESPONSE? ---- # 1. Some models have special responses that should not be standardized. This # includes: # - generalized linear models (counts, binomial, etc...) # - Survival models # 2. We also don't want to standardize the response when `two_sd = TRUE` - # instead we will standardize the response separately. include_response <- include_response && .safe_to_standardize_response(m_info) resp <- NULL if (!include_response || (include_response && two_sd)) { resp <- c( insight::find_response(x), insight::find_response(x, combine = FALSE) ) resp <- insight::clean_names(resp) resp <- unique(resp) } # If there's an offset, don't standardize offset OR response offsets <- insight::find_offset(x) if (length(offsets)) { if (include_response) { if (verbose) { insight::format_warning("Offset detected and will be standardized.") } if (two_sd) { # Treat offsets like responses - only standardize by 1 SD resp <- c(resp, offsets) offsets <- NULL } } else if (!include_response) { # Don't standardize offsets if not standardizing the response offsets <- NULL } } ## ---- DO NOT Z: ---- # 1. WEIGHTS: # because negative weights will cause errors in "update()" weight_variable <- insight::find_weights(x) if ( !is.null(weight_variable) && !weight_variable %in% colnames(model_data) && "(weights)" %in% colnames(model_data) ) { model_data$.missing_weight <- model_data[["(weights)"]] colnames(model_data)[ncol(model_data)] <- weight_variable weight_variable <- c(weight_variable, "(weights)") } # 2. RANDOM-GROUPS: random_group_factor <- insight::find_random( x, flatten = TRUE, split_nested = TRUE ) ## ---- SUMMARY: TO Z OR NOT TO Z? ---- dont_standardize <- c(resp, weight_variable, random_group_factor) do_standardize <- setdiff(colnames(model_data), dont_standardize) # can't std data$var variables doller_vars <- grepl("(.*)\\$(.*)", do_standardize) if (any(doller_vars)) { doller_vars <- colnames(model_data)[doller_vars] insight::format_warning( "Unable to standardize variables evaluated in the environment (i.e., not in `data`).", "The following variables will not be standardizd:", toString(doller_vars) ) do_standardize <- setdiff(do_standardize, doller_vars) dont_standardize <- c(dont_standardize, doller_vars) } if (!length(do_standardize)) { insight::format_warning("No variables could be standardized.") return(x) } ## ---- STANDARDIZE! ---- w <- insight::get_weights(x, remove_na = TRUE) data_std <- standardize( model_data[do_standardize], robust = robust, two_sd = two_sd, weights = if (weights) w, verbose = verbose ) # if two_sd, it must not affect the response! if (include_response && two_sd) { data_std[resp] <- standardize( model_data[resp], robust = robust, two_sd = FALSE, weights = if (weights) w, verbose = verbose ) dont_standardize <- setdiff(dont_standardize, resp) } # FIX LOG-SQRT VARS: # if we standardize log-terms, standardization will fail (because log of # negative value is NaN). Do some back-transformation here log_terms <- .log_terms(x, data_std) if (length(log_terms) > 0) { data_std[log_terms] <- lapply( data_std[log_terms], function(i) i - min(i, na.rm = TRUE) + 1 ) } # same for sqrt sqrt_terms <- .sqrt_terms(x, data_std) if (length(sqrt_terms) > 0) { data_std[sqrt_terms] <- lapply( data_std[sqrt_terms], function(i) i - min(i, na.rm = TRUE) ) } if (verbose && length(c(log_terms, sqrt_terms))) { insight::format_alert( "Formula contains log- or sqrt-terms.", "See help(\"standardize\") for how such terms are standardized." ) } ## ---- ADD BACK VARS THAT WHERE NOT Z ---- if (length(dont_standardize)) { remaining_columns <- intersect(colnames(model_data), dont_standardize) data_std <- cbind(model_data[, remaining_columns, drop = FALSE], data_std) } ## ---- UPDATE MODEL WITH Z DATA ---- on.exit(.update_failed()) if (isTRUE(verbose)) { model_std <- eval(substitute(update_expr)) } else { utils::capture.output({ model_std <- eval(substitute(update_expr)) }) } on.exit() # undo previous on.exit() model_std } # Special methods --------------------------------------------------------- #' @export standardize.brmsfit <- function( x, robust = FALSE, two_sd = FALSE, weights = TRUE, verbose = TRUE, include_response = TRUE, ... ) { data_std <- NULL # needed to avoid note if (insight::is_multivariate(x)) { insight::format_error( "Multivariate brmsfit models not supported.", "As an alternative: you may standardize your data (and adjust your priors), and re-fit the model." ) } .standardize_models( x, robust = robust, two_sd = two_sd, weights = weights, verbose = verbose, include_response = include_response, update_expr = stats::update(x, newdata = data_std), ... ) } #' @export standardize.mixor <- function( x, robust = FALSE, two_sd = FALSE, weights = TRUE, verbose = TRUE, include_response = TRUE, ... ) { data_std <- random_group_factor <- NULL # needed to avoid note .standardize_models( x, robust = robust, two_sd = two_sd, weights = weights, verbose = verbose, include_response = include_response, update_expr = { data_std <- data_std[ order(data_std[, random_group_factor, drop = FALSE]), ] stats::update(x, data = data_std) }, ... ) } #' @export standardize.mediate <- function( x, robust = FALSE, two_sd = FALSE, weights = TRUE, verbose = TRUE, include_response = TRUE, ... ) { # models and data y <- x$model.y m <- x$model.m y_data <- insight::get_data(y, source = "mf", verbose = FALSE) m_data <- insight::get_data(m, source = "mf", verbose = FALSE) # std models and data y_std <- standardize( y, robust = robust, two_sd = two_sd, weights = weights, verbose = verbose, include_response = include_response, ... ) m_std <- standardize( m, robust = robust, two_sd = two_sd, weights = weights, verbose = verbose, include_response = TRUE, ... ) y_data_std <- insight::get_data(y_std, source = "mf", verbose = FALSE) m_data_std <- insight::get_data(m_std, source = "mf", verbose = FALSE) # fixed values covs <- x$covariates control.value <- x$control.value treat.value <- x$treat.value if (!is.null(covs)) { covs <- mapply( .rescale_fixed_values, covs, names(covs), SIMPLIFY = FALSE, MoreArgs = list( y_data = y_data, m_data = m_data, y_data_std = y_data_std, m_data_std = m_data_std ) ) if (verbose) { insight::format_alert( "Covariates' values have been rescaled to their standardized scales." ) } } # if (is.numeric(y_data[[x$treat]]) || is.numeric(m_data[[x$treat]])) { # if (!(is.numeric(y_data[[x$treat]]) && is.numeric(m_data[[x$treat]]))) { # stop("'treat' variable is not of same type across both y and m models.", # "\nCannot consistently standardize.", call. = FALSE) # } # # temp_vals <- .rescale_fixed_values(c(control.value, treat.value), x$treat, # y_data = y_data, m_data = m_data, # y_data_std = y_data_std, m_data_std = m_data_std) # # control.value <- temp_vals[1] # treat.value <- temp_vals[2] # if (verbose) insight::format_alert("control and treatment values have been # rescaled to their standardized scales.") # } if (verbose && !all(c(control.value, treat.value) %in% c(0, 1))) { insight::format_warning( "Control and treat values are not 0 and 1, and have not been re-scaled.", "Interpret results with caution." ) } junk <- utils::capture.output({ model_std <- stats::update( x, model.y = y_std, model.m = m_std, # control.value = control.value, treat.value = treat.value covariates = covs ) }) model_std } # Cannot ------------------------------------------------------------------ #' @export standardize.wbm <- function(x, ...) { .update_failed(class(x)) } #' @export standardize.Surv <- standardize.wbm #' @export standardize.clm2 <- standardize.wbm #' @export standardize.bcplm <- standardize.wbm #' @export standardize.wbgee <- standardize.wbm #' @export standardize.biglm <- standardize.wbm # biglm doesn't regit the model to new data - it ADDs MORE data to the model. #' @export # Almost the same as `standardize.default()` but we pass `use_calling_env` in # update(). standardize.fixest <- function( x, robust = FALSE, two_sd = FALSE, weights = TRUE, verbose = TRUE, include_response = TRUE, ... ) { data_std <- NULL # needed to avoid note .standardize_models( x, robust = robust, two_sd = two_sd, weights = weights, verbose = verbose, include_response = include_response, update_expr = stats::update(x, data = data_std, use_calling_env = FALSE), ... ) } # helper ---------------------------- # Find log-terms inside model formula, and return "clean" term names .log_terms <- function(model, data) { x <- insight::find_terms(model, flatten = TRUE) # log_pattern <- "^log\\((.*)\\)" log_pattern <- "(log\\(log|log|log1|log10|log1p|log2)\\(([^,\\+)]*).*" out <- insight::trim_ws(gsub( log_pattern, "\\2", grep(log_pattern, x, value = TRUE) )) intersect(colnames(data), out) } # Find log-terms inside model formula, and return "clean" term names .sqrt_terms <- function(model, data) { x <- insight::find_terms(model, flatten = TRUE) pattern <- "sqrt\\(([^,\\+)]*).*" out <- insight::trim_ws(gsub(pattern, "\\1", grep(pattern, x, value = TRUE))) intersect(colnames(data), out) } #' @keywords internal .safe_to_standardize_response <- function(info, verbose = TRUE) { if (is.null(info)) { if (verbose) { insight::format_warning( "Unable to verify if response should not be standardized.", "Response will be standardized." ) } return(TRUE) } # check if model has a response variable that should not be standardized. info$is_linear && info$family != "inverse.gaussian" && !info$is_survival && !info$is_censored # # alternative would be to keep something like: # !info$is_count && # !info$is_ordinal && # !info$is_multinomial && # !info$is_beta && # !info$is_censored && # !info$is_binomial && # !info$is_survival # # And then treating response for "Gamma()" or "inverse.gaussian" similar to # # log-terms... } #' @keywords internal .rescale_fixed_values <- function( val, cov_nm, y_data, m_data, y_data_std, m_data_std ) { if (cov_nm %in% colnames(y_data)) { temp_data <- y_data temp_data_std <- y_data_std } else { temp_data <- m_data temp_data_std <- m_data_std } rescale( val, to = range(temp_data_std[[cov_nm]]), range = range(temp_data[[cov_nm]]) ) } #' @keywords internal .update_failed <- function(class = NULL, ...) { if (is.null(class)) { msg1 <- "Unable to refit the model with standardized data." } else { msg1 <- sprintf( "Standardization of parameters not possible for models of class '%s'.", class ) } insight::format_error( msg1, "Try instead to standardize the data (standardize(data)) and refit the model manually." ) } ================================================ FILE: R/text_format.R ================================================ #' Convenient text formatting functionalities #' #' Convenience functions to manipulate and format text. #' #' @param text,text2 A character string. #' @param width Positive integer giving the target column width for wrapping #' lines in the output. Can be "auto", in which case it will select 90\% of the #' default width. #' @param pattern Regex pattern to remove from `text`. #' @param sep Separator. #' @param last Last separator. #' @param n The number of characters to find. #' @param enclose Character that will be used to wrap elements of `text`, so #' these can be, e.g., enclosed with quotes or backticks. If `NULL` (default), #' text elements will not be enclosed. #' @param ... Other arguments to be passed to or from other functions. #' #' @return A character string. #' #' @examples #' # Add full stop if missing #' text_fullstop(c("something", "something else.")) #' #' # Find last characters #' text_lastchar(c("ABC", "DEF"), n = 2) #' #' # Smart concatenation #' text_concatenate(c("First", "Second", "Last")) #' text_concatenate(c("First", "Second", "Last"), last = " or ", enclose = "`") #' #' # Remove parts of string #' text_remove(c("one!", "two", "three!"), "!") #' #' # Wrap text #' long_text <- paste(rep("abc ", 100), collapse = "") #' cat(text_wrap(long_text, width = 50)) #' #' # Paste with optional separator #' text_paste(c("A", "", "B"), c("42", "42", "42")) #' @export text_format <- function( text, sep = ", ", last = " and ", width = NULL, enclose = NULL, ... ) { text_wrap( text_concatenate(text, sep = sep, last = last, enclose = enclose), width = width ) } #' @rdname text_format #' @export text_fullstop <- function(text) { text[!text_lastchar(text) %in% c(".", ":", ",", ";", "!", "?")] <- paste0( text[text_lastchar(text) != "."], "." ) text } #' @rdname text_format #' @export text_lastchar <- function(text, n = 1) { vapply( text, function(xx) { substr(xx, (nchar(xx) - n + 1), nchar(xx)) }, FUN.VALUE = character(1L) ) } #' @rdname text_format #' @export text_concatenate <- function(text, sep = ", ", last = " and ", enclose = NULL) { if (length(text) == 1 && !nzchar(text, keepNA = TRUE)) { return(text) } text <- text[text != ""] # nolint if ( length(text) && !is.null(enclose) && length(enclose) == 1 && nzchar(enclose, keepNA = TRUE) ) { text <- paste0(enclose, text, enclose) } if (length(text) == 1) { s <- text } else { s <- paste(text[1:(length(text) - 1)], collapse = sep) s <- paste(c(s, text[length(text)]), collapse = last) } s } #' @rdname text_format #' @export text_paste <- function(text, text2 = NULL, sep = ", ", enclose = NULL, ...) { if (!is.null(text2)) { if ( !is.null(enclose) && length(enclose) == 1 && nzchar(enclose, keepNA = TRUE) ) { text <- vapply( text, function(i) { if (i != "") { i <- paste0(enclose, i, enclose) } i }, character(1L) ) text2 <- vapply( text2, function(i) { if (i != "") { i <- paste0(enclose, i, enclose) } i }, character(1L) ) } paste0(text, ifelse(text == "" | text2 == "", "", sep), text2) # nolint } } #' @rdname text_format #' @export text_remove <- function(text, pattern = "", ...) { gsub(pattern, "", text, ...) } #' @rdname text_format #' @export text_wrap <- function(text, width = NULL, ...) { width <- width %||% getOption("width") text <- strsplit(text, "\n", fixed = TRUE) text <- unlist(text, use.names = FALSE) wrapped <- "" for (s in text) { if (nchar(s) > width) { leading_spaces <- nchar(s) - nchar(insight::trim_ws(s)) s <- strwrap(s, width = width) s <- paste(s, collapse = "\n") s <- paste0(strrep(" ", leading_spaces), s) } wrapped <- paste0(wrapped, s, "\n") } wrapped } ================================================ FILE: R/to_factor.R ================================================ #' @title Convert data to factors #' @name to_factor #' #' @details #' Convert variables or data into factors. If the data is labelled, value labels #' will be used as factor levels. The counterpart to convert variables into #' numeric is `to_numeric()`. #' #' @param x A data frame or vector. #' @param labels_to_levels Logical, if `TRUE`, value labels are used as factor #' levels after `x` was converted to factor. Else, factor levels are based on #' the values of `x` (i.e. as if using `as.factor()`). #' @param ... Arguments passed to or from other methods. #' @inheritParams extract_column_names #' @inheritParams categorize #' #' @inheritSection center Selection of variables - the `select` argument #' #' @return A factor, or a data frame of factors. #' #' @note Factors are ignored and returned as is. If you want to use value labels #' as levels for factors, use [`labels_to_levels()`] instead. #' #' @examples #' str(to_factor(iris)) #' #' # use labels as levels #' data(efc) #' str(efc$c172code) #' head(to_factor(efc$c172code)) #' @export to_factor <- function(x, ...) { UseMethod("to_factor") } #' @export to_factor.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( sprintf( "Converting into factors values currently not possible for variables of class `%s`.", class(x)[1] ) ) } x } #' @export to_factor.factor <- function(x, ...) { x } #' @rdname to_factor #' @export to_factor.numeric <- function(x, labels_to_levels = TRUE, verbose = TRUE, ...) { # preserve labels variable_label <- attr(x, "label", exact = TRUE) value_labels <- attr(x, "labels", exact = TRUE) # to factor x <- as.factor(x) # add back labels attr(x, "label") <- variable_label attr(x, "labels") <- value_labels # value labels to factor levels if (labels_to_levels) { x <- .value_labels_to_levels(x, verbose = verbose, ...) } x } #' @export to_factor.logical <- to_factor.numeric #' @export to_factor.character <- to_factor.numeric #' @export to_factor.Date <- to_factor.numeric #' @export to_factor.haven_labelled <- to_factor.numeric #' @export to_factor.double <- to_factor.numeric #' @rdname to_factor #' @export to_factor.data.frame <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, append = FALSE, regex = FALSE, verbose = TRUE, ... ) { # validation check, return as is for complete factor if (all(vapply(x, is.factor, FUN.VALUE = logical(1L)))) { return(x) } # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # drop factors, when append is not FALSE select <- colnames(x[select])[ !vapply(x[select], is.factor, FUN.VALUE = logical(1L)) ] # process arguments my_args <- .process_append( x, select, append, append_suffix = "_f", keep_factors = FALSE, keep_character = TRUE, preserve_value_labels = TRUE ) # update processed arguments x <- my_args$x select <- my_args$select } x[select] <- lapply(x[select], to_factor, verbose = verbose, ...) x } ================================================ FILE: R/to_numeric.R ================================================ #' Convert data to numeric #' #' Convert data to numeric by converting characters to factors and factors to #' either numeric levels or dummy variables. The "counterpart" to convert #' variables into factors is `to_factor()`. #' #' @param x A data frame, factor or vector. #' @param dummy_factors Transform factors to dummy factors (all factor levels as #' different columns filled with a binary 0-1 value). #' @param preserve_levels Logical, only applies if `x` is a factor. If `TRUE`, #' and `x` has numeric factor levels, these will be converted into the related #' numeric values. If this is not possible, the converted numeric values will #' start from 1 to number of levels. #' @param lowest Numeric, indicating the lowest (minimum) value when converting #' factors or character vectors to numeric values. #' @param ... Arguments passed to or from other methods. #' @inheritParams extract_column_names #' @inheritParams categorize #' #' @note When factors should be converted into multiple "binary" dummies, i.e. #' each factor level is converted into a separate column filled with a binary #' 0-1 value, set `dummy_factors = TRUE`. If you want to preserve the original #' factor levels (in case these represent numeric values), use #' `preserve_levels = TRUE`. #' #' @section Selection of variables - `select` argument: #' For most functions that have a `select` argument the complete input data #' frame is returned, even when `select` only selects a range of variables. #' However, for `to_numeric()`, factors might be converted into dummies, #' thus, the number of variables of the returned data frame no longer match #' the input data frame. Hence, when `select` is used, *only* those variables #' (or their dummies) specified in `select` will be returned. Use `append=TRUE` #' to also include the original variables in the returned data frame. #' #' @examples #' to_numeric(head(ToothGrowth)) #' to_numeric(head(ToothGrowth), dummy_factors = TRUE) #' #' # factors #' x <- as.factor(mtcars$gear) #' to_numeric(x) #' to_numeric(x, preserve_levels = TRUE) #' # same as: #' coerce_to_numeric(x) #' #' @return A data frame of numeric variables. #' #' @export to_numeric <- function(x, ...) { UseMethod("to_numeric") } #' @export to_numeric.default <- function(x, verbose = TRUE, ...) { if (isTRUE(verbose)) { insight::format_alert( sprintf( "Converting into numeric values currently not possible for variables of class '%s'.", class(x)[1] ) ) } x } #' @rdname to_numeric #' @export to_numeric.data.frame <- function( x, select = NULL, exclude = NULL, dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # validation check, return as is for complete numeric if (all(vapply(x, is.numeric, FUN.VALUE = logical(1L)))) { return(x) } df_attr <- attributes(x) # evaluate arguments select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # when we append variables, we call ".process_append()", which will # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # drop numerics, when append is not FALSE select <- colnames(x[select])[ !vapply(x[select], is.numeric, FUN.VALUE = logical(1L)) ] # process arguments fun_args <- .process_append( x, select, append, append_suffix = "_n", keep_factors = TRUE ) # update processed arguments x <- fun_args$x select <- fun_args$select } out <- sapply( x[select], to_numeric, dummy_factors = dummy_factors, preserve_levels = preserve_levels, lowest = lowest, verbose = verbose, simplify = FALSE ) # save variable attributes attr_vars <- lapply(out, attributes) # "out" is currently a list, bind columns and to data frame out <- as.data.frame(do.call(cbind, out)) # set back attributes for (i in colnames(out)) { if (is.list(attr_vars[[i]])) { if (is.list(attributes(out[[i]]))) { attributes(out[[i]]) <- utils::modifyList( attr_vars[[i]], attributes(out[[i]]) ) } else { attributes(out[[i]]) <- attr_vars[[i]] } } } # due to the special handling of dummy factors, we need to take care # of appending the data here again. usually, "fun_args$x" includes the appended # data, which does not work here... if (!isFALSE(append)) { common_columns <- intersect(colnames(x), colnames(out)) if (length(common_columns)) { x[common_columns] <- NULL } out <- cbind(x, out) } # add back custom attributes out <- .replace_attrs(out, df_attr) out } #' @export to_numeric.numeric <- function(x, verbose = TRUE, ...) { .set_back_labels(as.numeric(x), x, reverse_values = FALSE) } #' @export to_numeric.double <- to_numeric.numeric #' @export to_numeric.logical <- to_numeric.numeric #' @export to_numeric.haven_labelled <- to_numeric.numeric #' @export to_numeric.Date <- function(x, verbose = TRUE, ...) { if (verbose) { insight::format_warning( paste0( "Converting a date-time variable of class `", class(x)[1], "` into numeric." ), "Please note that this conversion probably does not return meaningful results." ) } as.numeric(x) } #' @export to_numeric.POSIXt <- to_numeric.Date #' @export to_numeric.POSIXct <- to_numeric.Date #' @export to_numeric.POSIXlt <- to_numeric.Date #' @export to_numeric.factor <- function( x, dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, verbose = TRUE, ... ) { # preserving levels only works when factor levels are numeric if ( isTRUE(preserve_levels) && anyNA(suppressWarnings(as.numeric(as.character(stats::na.omit(x))))) ) { preserve_levels <- FALSE } if (dummy_factors) { out <- as.data.frame(stats::model.matrix( ~x, contrasts.arg = list(x = "contr.treatment") )) out[1] <- as.numeric(rowSums(out[2:ncol(out)]) == 0) # insert back NA rows. if "x" had missing values, model.matrix() creates an # array with only non-missing values, so some rows are missing. First, we # need to now which rows are missing (na_values) and the length of the # original vector (which will be the number of rows in the final data frame) na_values <- which(is.na(x)) rows_x <- length(x) if (any(na_values)) { # iterate all missing values that have for (i in seq_along(na_values)) { # if the first observation was missing, add NA row and bind data frame if (i == 1 && na_values[i] == 1) { out <- rbind(NA, out) } else if (na_values[i] == rows_x) { # if the last observation was NA, add NA row to data frame out <- rbind(out, NA) } else { # else, pick rows from beginning to current NA value, add NA, # and rbind the remaining rows out <- rbind( out[1:(na_values[i] - 1), ], NA, out[na_values[i]:nrow(out), ] ) } } rownames(out) <- NULL } names(out) <- levels(x) } else if (preserve_levels) { if (is.unsorted(levels(x))) { x_inverse <- rep(NA_real_, length(x)) for (i in 1:nlevels(x)) { x_inverse[x == levels(x)[i]] <- as.numeric(levels(x)[ nlevels(x) - i + 1 ]) } x <- factor(x_inverse) } out <- .set_back_labels( as.numeric(as.character(x)), x, reverse_values = FALSE ) } else { out <- .set_back_labels(as.numeric(x), x, reverse_values = FALSE) } # shift to requested starting value if (!is.null(lowest)) { difference <- min(out) - lowest out <- out - difference } out } #' @export to_numeric.character <- function( x, dummy_factors = FALSE, lowest = NULL, verbose = TRUE, ... ) { numbers <- vapply( x, function(i) { element <- tryCatch(str2lang(i), error = function(e) NULL) !is.null(element) && is.numeric(element) }, FUN.VALUE = logical(1L) ) if (all(numbers)) { out <- as.numeric(vapply(x, str2lang, FUN.VALUE = numeric(1L))) } else { out <- to_numeric(as.factor(x), dummy_factors = dummy_factors) } # shift to requested starting value if (!is.null(lowest)) { difference <- min(out) - lowest out <- out - difference } out } #' Convert to Numeric (if possible) #' #' Tries to convert vector to numeric if possible (if no warnings or errors). #' Otherwise, leaves it as is. #' #' @param x A vector to be converted. #' #' @examples #' coerce_to_numeric(c("1", "2")) #' coerce_to_numeric(c("1", "2", "A")) #' @return Numeric vector (if possible) #' @export coerce_to_numeric <- function(x) { tryCatch( as.numeric(as.character(x)), error = function(e) x, warning = function(w) x ) } ================================================ FILE: R/unnormalize.R ================================================ #' @rdname normalize #' @export unnormalize <- function(x, ...) { UseMethod("unnormalize") } #' @export unnormalize.default <- function(x, ...) { insight::format_error( "Variables of class '", class(x)[1], "' can't be unnormalized." ) } #' @rdname normalize #' @export unnormalize.numeric <- function(x, verbose = TRUE, ...) { ## TODO implement algorithm include_bounds = FALSE # if function called from the "grouped_df" method, we use the dw_transformer # attributes that were recovered in the "grouped_df" method dots <- match.call(expand.dots = FALSE)[["..."]] grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L)) if (is.null(grp_attr_dw)) { include_bounds <- attr(x, "include_bounds") min_value <- attr(x, "min_value") range_difference <- attr(x, "range_difference") to_range <- attr(x, "to_range") } else { names(grp_attr_dw) <- gsub(".*\\.", "", names(grp_attr_dw)) include_bounds <- grp_attr_dw["include_bounds"] min_value <- grp_attr_dw["min_value"] range_difference <- grp_attr_dw["range_difference"] to_range <- grp_attr_dw["to_range"] if (is.na(to_range)) { to_range <- NULL } } if (is.null(min_value) || is.null(range_difference)) { if (verbose) { insight::format_error( "Can't unnormalize variable. Information about range and/or minimum value is missing." ) } return(x) } if (is.null(to_range)) { x * range_difference + min_value } else { (x - to_range[1]) * (range_difference / diff(to_range)) + min_value } } #' @rdname normalize #' @export unnormalize.data.frame <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, verbose = verbose ) # if function called from the "grouped_df" method, we use the dw_transformer # attributes that were recovered in the "grouped_df" method dots <- match.call(expand.dots = FALSE)[["..."]] if (is.null(dots$grp_attr_dw)) { grp_attr_dw <- NULL } else { grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L)) } for (i in select) { var_attr <- grep(paste0("^attr\\_", i, "\\."), names(grp_attr_dw)) attrs <- grp_attr_dw[var_attr] x[[i]] <- unnormalize(x[[i]], verbose = verbose, grp_attr_dw = attrs) } x } #' @rdname normalize #' @export unnormalize.grouped_df <- function( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, remove_group_var = TRUE, verbose = verbose ) info <- attributes(x) grps <- attr(x, "groups", exact = TRUE)[[".rows"]] x <- as.data.frame(x) for (i in select) { if (is.null(info$groups[[paste0("attr_", i)]])) { insight::format_error( paste( "Couldn't retrieve the necessary information to unnormalize", text_concatenate(i, enclose = "`") ) ) } } for (rows in seq_along(grps)) { # get the dw_transformer attributes for this group raw_attrs <- unlist(info$groups[ rows, startsWith(names(info$groups), "attr") ]) if (length(select) == 1L) { names(raw_attrs) <- paste0("attr_", select, ".", names(raw_attrs)) } tmp <- unnormalize( x[grps[[rows]], , drop = FALSE], select = select, exclude = exclude, ignore_case = ignore_case, regex = regex, verbose = verbose, grp_attr_dw = raw_attrs ) x[grps[[rows]], ] <- tmp } # set back class, so data frame still works with dplyr attributes(x) <- utils::modifyList(info, attributes(x)) class(x) <- c("grouped_df", class(x)) x } ================================================ FILE: R/unstandardize.R ================================================ #' @rdname standardize #' @export unstandardize <- function(x, ...) { UseMethod("unstandardize") } #' @rdname standardize #' @export unstandardise <- unstandardize #' @rdname standardize #' @export unstandardize.numeric <- function( x, center = NULL, scale = NULL, reference = NULL, robust = FALSE, two_sd = FALSE, ... ) { if (!is.null(reference)) { if (robust) { center <- stats::median(reference, na.rm = TRUE) scale <- stats::mad(reference, na.rm = TRUE) } else { center <- mean(reference, na.rm = TRUE) scale <- stats::sd(reference, na.rm = TRUE) } } else if (is.null(center) || is.null(scale)) { if (all(c("center", "scale") %in% names(attributes(x)))) { center <- attr(x, "center", exact = TRUE) scale <- attr(x, "scale", exact = TRUE) attr(x, "scale") <- attr(x, "center") <- NULL } else if ( all(c("scaled:center", "scaled:scale") %in% names(attributes(x))) ) { center <- attr(x, "scaled:center", exact = TRUE) scale <- attr(x, "scaled:scale", exact = TRUE) attr(x, "scaled:scale") <- attr(x, "scaled:center") <- NULL } else { insight::format_error( "You must provide the arguments `center`, `scale` or `reference`." ) } } if (two_sd) { scale <- 2 * scale } x * scale + center } #' @rdname standardize #' @export unstandardize.data.frame <- function( x, center = NULL, scale = NULL, reference = NULL, robust = FALSE, two_sd = FALSE, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # Select and deselect cols <- .select_nse( select, x, exclude = exclude, ignore_case, regex = regex, verbose = verbose ) dots <- match.call(expand.dots = FALSE)[["..."]] if (is.null(dots$grp_attr_dw)) { grp_attr_dw <- NULL } else { grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L)) } if (!is.null(grp_attr_dw)) { center <- vapply( cols, function(x) { grp_attr_dw[grep( paste0("^attr\\_", x, "\\.center"), names(grp_attr_dw) )] }, FUN.VALUE = numeric(1L) ) scale <- vapply( cols, function(x) { grp_attr_dw[grep(paste0("^attr\\_", x, "\\.scale"), names(grp_attr_dw))] }, FUN.VALUE = numeric(1L) ) i <- vapply(x[, cols, drop = FALSE], is.numeric, FUN.VALUE = logical(1L)) } else if (!is.null(reference)) { i <- vapply(x[, cols, drop = FALSE], is.numeric, FUN.VALUE = logical(1L)) i <- i[i] reference <- reference[names(i)] if (robust) { center <- vapply( reference, FUN.VALUE = numeric(1L), stats::median, na.rm = TRUE ) scale <- vapply( reference, FUN.VALUE = numeric(1L), stats::mad, na.rm = TRUE ) } else { center <- vapply(reference, FUN.VALUE = numeric(1L), mean, na.rm = TRUE) scale <- vapply( reference, FUN.VALUE = numeric(1L), stats::sd, na.rm = TRUE ) } } else if (is.null(center) || is.null(scale)) { i <- vapply( x[, cols, drop = FALSE], function(k) { a <- attributes(k) is.numeric(k) && !is.null(a) && all(c("scale", "center") %in% names(a)) }, FUN.VALUE = logical(1L) ) if (any(i)) { i <- i[i] center <- vapply( x[names(i)], FUN.VALUE = numeric(1L), attr, "center", exact = TRUE ) scale <- vapply( x[names(i)], FUN.VALUE = numeric(1L), attr, "scale", exact = TRUE ) } else if (all(c("center", "scale") %in% names(attributes(x)))) { center <- attr(x, "center", exact = TRUE) scale <- attr(x, "scale", exact = TRUE) attr(x, "center") <- attr(x, "scale") <- NULL i <- names(x) %in% names(scale) i <- i[i] } else { insight::format_error( "You must provide the arguments `center`, `scale` or `reference`." ) } } else { if (is.null(names(center))) { i <- vapply(x, is.numeric, FUN.VALUE = logical(1L)) names(center) <- names(scale) <- names(x[i]) } i <- names(x) %in% names(center) names(i) <- names(x) i <- i[i] } if (two_sd) { scale <- 2 * scale } cols <- names(i) # Apply unstandardization to cols for (col in cols) { x[col] <- unstandardize( x[[col]], center = center[[col]], scale = scale[[col]] ) } x } #' @export unstandardize.factor <- function(x, ...) { x } #' @export unstandardize.character <- function(x, ...) { x } #' @export unstandardize.grouped_df <- function( x, center = NULL, scale = NULL, reference = NULL, robust = FALSE, two_sd = FALSE, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) { # evaluate select/exclude, may be select-helpers select <- .select_nse( select, x, exclude, ignore_case, regex = regex, remove_group_var = TRUE, verbose = verbose ) info <- attributes(x) grps <- attr(x, "groups", exact = TRUE)[[".rows"]] x <- as.data.frame(x) for (i in select) { if (is.null(info$groups[[paste0("attr_", i)]])) { insight::format_error( paste( "Couldn't retrieve the necessary information to unstandardize", text_concatenate(i, enclose = "`") ) ) } } for (rows in seq_along(grps)) { # get the dw_transformer attributes for this group raw_attrs <- unlist(info$groups[ rows, startsWith(names(info$groups), "attr") ]) if (length(select) == 1L) { names(raw_attrs) <- paste0("attr_", select, ".", names(raw_attrs)) } tmp <- unstandardise( x[grps[[rows]], , drop = FALSE], center = center, scale = scale, reference = reference, robust = robust, two_sd = two_sd, select = select, exclude = exclude, ignore_case = ignore_case, regex = regex, verbose = verbose, grp_attr_dw = raw_attrs ) x[grps[[rows]], ] <- tmp } # set back class, so data frame still works with dplyr attributes(x) <- utils::modifyList(info, attributes(x)) class(x) <- c("grouped_df", class(x)) x } #' @export unstandardize.matrix <- function( x, center = NULL, scale = NULL, reference = NULL, robust = FALSE, two_sd = FALSE, ... ) { if (all(c("scaled:center", "scaled:scale") %in% names(attributes(x)))) { center <- attr(x, "scaled:center", exact = TRUE) scale <- attr(x, "scaled:scale", exact = TRUE) attr(x, "scaled:center") <- attr(x, "scaled:scale") <- NULL for (col in seq_len(ncol(x))) { x[, col] <- unstandardize.numeric( x[, col], center = center[col], scale = scale[col] ) } } else { scales <- attr(x, "scale") centers <- attr(x, "center") xl <- lapply(seq_len(ncol(x)), function(i) { tmp <- x[, i] attributes(tmp) <- list(center = centers[i], scale = scales[i]) tmp }) xz <- lapply(xl, datawizard::unstandardize, ...) x_out <- do.call(cbind, xz) dimnames(x_out) <- dimnames(x) x <- x_out } x } #' @export unstandardize.array <- unstandardize.matrix # Datagrid ---------------------------------------------------------------- #' @export unstandardize.datagrid <- function(x, ...) { x[names(x)] <- unstandardize( as.data.frame(x), reference = attributes(x)$data, ... ) x } #' @export unstandardize.visualisation_matrix <- unstandardize.datagrid ================================================ FILE: R/utils-cols.R ================================================ #' Tools for working with column names #' #' @param x A data frame. #' @param row Row to use as column names. #' @param na_prefix Prefix to give to the column name if the row has an `NA`. #' Default is 'x', and it will be incremented at each `NA` (`x1`, `x2`, etc.). #' @param verbose Toggle warnings. #' @param prefix Prefix to give to the column name. Default is 'x', and it will #' be incremented at each column (`x1`, `x2`, etc.). #' #' @return #' `row_to_colnames()` and `colnames_to_row()` both return a data frame. #' #' @rdname colnames #' #' @export #' #' @examples #' # Convert a row to column names -------------------------------- #' test <- data.frame( #' a = c("iso", 2, 5), #' b = c("year", 3, 6), #' c = c("value", 5, 7) #' ) #' test #' row_to_colnames(test) #' #' # Convert column names to row -------------------------------- #' test <- data.frame( #' ARG = c("BRA", "FRA"), #' `1960` = c(1960, 1960), #' `2000` = c(2000, 2000) #' ) #' test #' colnames_to_row(test) #' row_to_colnames <- function(x, row = 1, na_prefix = "x", verbose = TRUE) { if (!is.numeric(row)) { insight::format_error("Argument `row` must be of type numeric.") } if (length(row) != 1) { insight::format_error("Argument `row` must be of length 1.") } if (nrow(x) < row) { insight::format_error( paste0( "You used row = ", row, " but the dataset only has ", nrow(x), " rows." ) ) } new_colnames <- as.character(unlist(x[row, ], use.names = FALSE)) # Create default colnames if there are NAs in the row used which_na <- which(is.na(new_colnames)) n_na <- length(which_na) if (n_na > 0) { for (i in seq_along(which_na)) { new_colnames[which_na[i]] <- paste0(na_prefix, i) } if (verbose) { insight::format_warning( paste0( "Some values of row ", row, " were NAs. The corresponding column names are prefixed with `", na_prefix, "`." ) ) } } colnames(x) <- new_colnames x[-row, ] } #' @rdname colnames #' @export colnames_to_row <- function(x, prefix = "x") { if (length(prefix) != 1) { insight::format_error("Argument `prefix` must be of length 1.") } if (!is.character(prefix)) { insight::format_error("Argument `prefix` must be of type character.") } x2 <- rbind(colnames(x), x) colnames(x2) <- paste0(prefix, seq_len(ncol(x2))) x2 } ================================================ FILE: R/utils-rows.R ================================================ #' Tools for working with row names or row ids #' #' @param x A data frame. #' @param var Name of column to use for row names/ids. For `column_as_rownames()`, #' this argument can be the variable name or the column number. For #' `rownames_as_column()` and `rowid_as_column()`, the column name must not #' already exist in the data. #' #' @details #' These are similar to `tibble`'s functions `column_to_rownames()`, #' `rownames_to_column()` and `rowid_to_column()`. Note that the behavior of #' `rowid_as_column()` is different for grouped dataframe: instead of making #' the rowid unique across the full dataframe, it creates rowid per group. #' Therefore, there can be several rows with the same rowid if they belong to #' different groups. #' #' If you are familiar with `dplyr`, this is similar to doing the following: #' ```r #' data |> #' group_by(grp) |> #' mutate(id = row_number()) |> #' ungroup() #' ``` #' #' @return #' A data frame. #' #' @rdname rownames #' #' @examples #' # Convert between row names and column -------------------------------- #' test <- rownames_as_column(mtcars, var = "car") #' test #' head(column_as_rownames(test, var = "car")) #' #' @export rownames_as_column <- function(x, var = "rowname") { if (!insight::object_has_rownames(x)) { insight::format_error("The data frame doesn't have rownames.") } if (is.null(var)) { var <- "rowname" } if (!is.character(var)) { insight::format_error("Argument 'var' must be of type character.") } if (var %in% colnames(x)) { insight::format_error( paste0("There is already a variable named `", var, "` in your dataset.") ) } original_x <- x rn <- data.frame(rn = rownames(x), stringsAsFactors = FALSE) x <- cbind(rn, x) colnames(x)[1] <- var rownames(x) <- NULL x <- .replace_attrs(x, attributes(original_x)) x } #' @rdname rownames #' @export column_as_rownames <- function(x, var = "rowname") { if (!is.character(var) && !is.numeric(var)) { insight::format_error( "Argument `var` must be of type character or numeric." ) } if (is.character(var) && !var %in% names(x)) { insight::format_error(paste0( "Variable \"", var, "\" is not in the data frame." )) } if (is.numeric(var) && (var > ncol(x) || var <= 0)) { insight::format_error( "Column ", var, " does not exist. There are ", ncol(x), " columns in the data frame." ) } original_x <- x rownames(x) <- x[[var]] x[[var]] <- NULL x <- .replace_attrs(x, attributes(original_x)) x } #' @rdname rownames #' @export #' @examples #' test_data <- head(iris) #' #' rowid_as_column(test_data) #' rowid_as_column(test_data, var = "my_id") rowid_as_column <- function(x, var = "rowid") { UseMethod("rowid_as_column") } #' @export rowid_as_column.default <- function(x, var = "rowid") { if (is.null(var)) { var <- "rowid" } if (!is.character(var)) { insight::format_error("Argument 'var' must be of type character.") } if (var %in% colnames(x)) { insight::format_error( paste0("There is already a variable named `", var, "` in your dataset.") ) } original_x <- x rn <- data.frame(rn = seq_len(nrow(x)), stringsAsFactors = FALSE) x <- cbind(rn, x) colnames(x)[1] <- var rownames(x) <- NULL x <- .replace_attrs(x, attributes(original_x)) x } #' @export rowid_as_column.grouped_df <- function(x, var = "rowid") { if (!is.character(var)) { insight::format_error("Argument 'var' must be of type character.") } if (var %in% colnames(x)) { insight::format_error( paste0("There is already a variable named `", var, "` in your dataset.") ) } grps <- attr(x, "groups", exact = TRUE) grps <- grps[[".rows"]] for (i in seq_along(grps)) { x[grps[[i]], var] <- seq_along(grps[[i]]) } # can't just put select = "var" because there could be another variable # called var x <- data_relocate(x, paste0("^", var, "$"), regex = TRUE, before = 1) x } ================================================ FILE: R/utils.R ================================================ #' @keywords internal .get_model_info <- function(model, model_info = NULL, ...) { if (is.null(model_info)) { model_info <- insight::model_info(model) } model_info } #' `NULL` coalescing operator #' #' @keywords internal #' @noRd `%||%` <- function(x, y) { if (is.null(x)) y else x } #' Try to convert object to a dataframe #' #' @keywords internal #' @noRd .coerce_to_dataframe <- function(data) { if (!is.data.frame(data)) { data <- tryCatch( as.data.frame(data, stringsAsFactors = FALSE), error = function(e) { insight::format_error( "`data` must be a data frame, or an object that can be coerced to a data frame." ) } ) } data } #' Checks dataframes for syntactically valid column names #' Argument "action" can be "warning", "error", or "message". #' #' @keywords internal #' @noRd .check_dataframe_names <- function(x, action = "warning", verbose = TRUE) { if (verbose && !all(colnames(x) == make.names(colnames(x), unique = TRUE))) { insight::format_alert( "Bad column names (e.g., with spaces) have been detected which might create issues in many functions.", paste0( "We recommend to rename following columns: ", text_concatenate( colnames(x)[colnames(x) != make.names(colnames(x), unique = TRUE)], enclose = "`" ) ), "You can run `names(mydata) <- make.names(names(mydata))` or use `janitor::clean_names()` for a quick fix.", # nolint type = action ) } } #' Fuzzy grep, matches pattern that are close, but not identical #' @examples #' colnames(iris) #' p <- sprintf("(%s){~%i}", "Spela", 2) #' grep(pattern = p, x = colnames(iris), ignore.case = FALSE) #' @keywords internal #' @noRd .fuzzy_grep <- function(x, pattern, precision = NULL) { if (is.null(precision)) { precision <- round(nchar(pattern) / 3) } if (precision > nchar(pattern)) { return(NULL) } p <- sprintf("(%s){~%i}", pattern, precision) grep(pattern = p, x = x, ignore.case = FALSE) } #' create a message string to tell user about matches that could possibly #' be the string they were looking for #' #' @keywords internal #' @noRd .misspelled_string <- function(source, searchterm, default_message = NULL) { if (is.null(searchterm) || length(searchterm) < 1) { return(default_message) } # used for many matches more_found <- "" # init default msg <- "" # guess the misspelled string possible_strings <- unlist( lapply(searchterm, function(s) { source[.fuzzy_grep(source, s)] # nolint }), use.names = FALSE ) if (length(possible_strings)) { msg <- "Did you mean " if (length(possible_strings) > 1) { # make sure we don't print dozens of alternatives for larger data frames if (length(possible_strings) > 5) { more_found <- sprintf( " We even found %i more possible matches, not shown here.", length(possible_strings) - 5 ) possible_strings <- possible_strings[1:5] } msg <- paste0( msg, "one of ", text_concatenate(possible_strings, enclose = "\"", last = " or ") ) } else { msg <- paste0(msg, "\"", possible_strings, "\"") } msg <- paste0(msg, "?", more_found) } else { msg <- default_message } # no double white space insight::trim_ws(msg) } #' Check that a vector is sorted #' @noRd #' @keywords internal .is_sorted <- Negate(is.unsorted) #' Replace only custom attributes #' #' Using "attributes(out) <- attributes(data)" or similar doesn't work so well #' for big datasets because it takes some time to attribute the row names. #' #' This function gives only custom attributes to the new dataset. #' @noRd #' @keywords internal .replace_attrs <- function(data, custom_attr) { for (nm in setdiff(names(custom_attr), names(attributes(data.frame())))) { attr(data, which = nm) <- custom_attr[[nm]] } data } #' @keywords internal .is_date <- function(x) { inherits(x, "Date") } #' @keywords internal .are_weights <- function(w) { !is.null(w) && length(w) && !all(w == 1) && !all(w == w[1]) } #' @keywords internal .factor_to_numeric <- function(x) { # no need to change for numeric if (is.numeric(x)) { return(x) } # Dates can be coerced by as.numeric(), w/o as.character() if (inherits(x, "Date")) { return(as.numeric(x)) } # Logicals should be 0/1 if (is.logical(x)) { return(as.numeric(x)) } if (anyNA(suppressWarnings(as.numeric(as.character(stats::na.omit(x)))))) { if (is.character(x)) { x <- as.factor(x) } levels(x) <- 1:nlevels(x) } as.numeric(as.character(x)) } # For standardize_parameters ---------------------------------------------- #' Taken from https://github.com/coolbutuseless/gluestick (licence: MIT) #' Same functionality as `{glue}` #' #' @noRd #' @keywords internal .gluestick <- function( fmt, src = parent.frame(), open = "{", close = "}", eval = TRUE ) { nchar_open <- nchar(open) nchar_close <- nchar(close) # validation checks stopifnot(exprs = { is.character(fmt) length(fmt) == 1L is.character(open) length(open) == 1L nchar_open > 0L is.character(close) length(close) == 1 nchar_close > 0 }) # Brute force the open/close characters into a regular expression for # extracting the expressions from the format string open <- gsub("(.)", "\\\\\\1", open) # Escape everything!! close <- gsub("(.)", "\\\\\\1", close) # Escape everything!! re <- paste0(open, ".*?", close) # Extract the delimited expressions matches <- gregexpr(re, fmt) exprs <- regmatches(fmt, matches)[[1]] # Remove the delimiters exprs <- substr(exprs, nchar_open + 1L, nchar(exprs) - nchar_close) # create a valid sprintf fmt string. # - replace all "{expr}" strings with "%s" # - escape any '%' so sprintf() doesn't try and use them for formatting # but only if the '%' is NOT followed by an 's' # # gluestick() doesn't deal with any pathological cases fmt_sprintf <- gsub(re, "%s", fmt) fmt_sprintf <- gsub("%(?!s)", "%%", fmt_sprintf, perl = TRUE) # Evaluate if (eval) { fun_args <- lapply(exprs, function(expr) { eval(parse(text = expr), envir = src) }) } else { fun_args <- unname(mget(exprs, envir = as.environment(src))) } # Create the string(s) do.call(sprintf, c(list(fmt_sprintf), fun_args)) } #' help-functions #' @keywords internal #' @noRd .data_frame <- function(...) { x <- data.frame(..., stringsAsFactors = FALSE) rownames(x) <- NULL x } ================================================ FILE: R/utils_labels.R ================================================ # after data transformation, label attributes get lost. This function # extracts label attributes from the original vector and adds them back # to the transformed vector #' @keywords internal .set_back_labels <- function( new, old, include_values = TRUE, reverse_values = FALSE ) { # labelled data? attr(new, "label") <- attr(old, "label", exact = TRUE) value_labels <- attr(old, "labels", exact = TRUE) # "include_values" is used to preserve value labels if (isTRUE(include_values) && !is.null(value_labels)) { if (reverse_values) { # reverse values? Used for "reverse_scale()" attr(new, "labels") <- stats::setNames( rev(value_labels), names(value_labels) ) } else if (is.numeric(new)) { # keep value oder? Used for "to_numeric()" if (any(grepl("[^0-9]", value_labels))) { # if we have any non-numeric characters, convert to numeric attr(new, "labels") <- stats::setNames( as.numeric(as.factor(value_labels)), names(value_labels) ) } else { # if we have numeric, or "numeric character" (like "1", "2", "3" etc.) attr(new, "labels") <- stats::setNames( as.numeric(value_labels), names(value_labels) ) } } else { attr(new, "labels") <- stats::setNames(value_labels, names(value_labels)) } } else if (isFALSE(include_values)) { attr(new, "labels") <- NULL } new } # This functions converts value labels that are saved as attributes # into factor levels #' @keywords internal .value_labels_to_levels <- function(x, verbose = TRUE, ...) { # extract value labels value_labels <- attr(x, "labels", exact = TRUE) # return, if none if (is.null(value_labels)) { return(x) } # check positions of matching values and levels levels_in_labs <- stats::na.omit(match(value_labels, levels(x))) labs_in_levels <- stats::na.omit(match(levels(x), value_labels)) # validation check - if labelled values and levels don't match if (!length(levels_in_labs) || !length(labs_in_levels)) { if (verbose) { insight::format_alert( "Could not use value labels as factor levels.", "Labelled values and factor levels had no match." ) } return(x) } # check if all levels have matching labels, and if not, tell user if (verbose && nlevels(x) != length(levels_in_labs)) { insight::format_alert( "Not all factor levels had a matching value label. Non-matching levels were preserved." ) } # we need to find out which levels have no labelled value missing_levels <- levels(x)[!levels(x) %in% value_labels] # and we need to remove those value labels that don't have a matching level value_labels <- value_labels[value_labels %in% levels(x)] # for levels that have no label, we just keep the original factor level value_labels <- c( value_labels, stats::setNames(missing_levels, missing_levels) ) # now we can add back levels levels(x) <- names(value_labels)[order(as.numeric(value_labels))] attr(x, "labels") <- NULL x } ================================================ FILE: R/utils_standardize_center.R ================================================ # preparation for standardize and center ---- # # Performs some preparation when standardizing or centering variables, # like finding the center or scale, also in relation to some reference values. # This function is applied to *vectors*. # #' @keywords internal .process_std_center <- function( x, weights, robust, verbose = TRUE, reference = NULL, center = NULL, scale = NULL ) { # Warning if all NaNs if (all(is.na(x) | is.infinite(x))) { return(NULL) } if (.are_weights(weights)) { valid_x <- !is.na(x) & !is.na(weights) & !is.infinite(x) & !is.infinite(weights) na_values <- is.na(x) | is.na(weights) inf_values <- is.infinite(x) | is.infinite(weights) vals <- x[valid_x] weights <- weights[valid_x] } else { valid_x <- !is.na(x) & !is.infinite(x) na_values <- is.na(x) inf_values <- is.infinite(x) vals <- x[valid_x] } # validation checks check <- .check_standardize_numeric( x, name = NULL, verbose = verbose, reference = reference, center = center ) if (is.factor(vals) || is.character(vals)) { vals <- .factor_to_numeric(vals) } # Get center and scale ref <- .get_center_scale( vals, robust, weights, reference, .center = center, .scale = scale, verbose = verbose ) list( vals = vals, valid_x = valid_x, center = ref$center, scale = ref$scale, check = check, na_values = na_values, inf_values = inf_values ) } # processing and checking of arguments ---- # # Performs some preparation when standardizing or centering variables, # like finding the center or scale, also in relation to some reference values. # This function is applied to the *data frame methods*. # #' @keywords internal .process_std_args <- function( x, select, exclude, weights, append, append_suffix = "_z", keep_factors, remove_na = "none", reference = NULL, .center = NULL, .scale = NULL, keep_character = FALSE, preserve_value_labels = FALSE ) { # check append argument, and set default if (isFALSE(append)) { append <- NULL } else if (isTRUE(append)) { append <- append_suffix } if (!is.null(weights) && is.character(weights)) { if (weights %in% colnames(x)) { exclude <- c(exclude, weights) } else { insight::format_warning( paste0( "Could not find weighting column `", weights, "`. Weighting not carried out." ) ) weights <- NULL } } select <- .select_variables(x, select, exclude, keep_factors, keep_character) # check if selected variables are in reference if (!is.null(reference) && !all(select %in% names(reference))) { insight::format_error( "The `reference` must include all variables from `select`." ) } # copy label attributes variable_labels <- insight::compact_list(lapply( x, attr, "label", exact = TRUE )) value_labels <- NULL if (preserve_value_labels) { value_labels <- insight::compact_list(lapply( x, attr, "labels", exact = TRUE )) } # drop NAs remove_na <- match.arg(remove_na, c("none", "selected", "all")) omit <- switch( remove_na, none = logical(nrow(x)), selected = rowSums(vapply(x[select], is.na, FUN.VALUE = logical(nrow(x)))) > 0, all = rowSums(vapply(x, is.na, FUN.VALUE = logical(nrow(x)))) > 0 ) x <- x[!omit, , drop = FALSE] if (!is.null(weights) && is.character(weights)) { weights <- x[[weights]] } # append standardized variables if (!is.null(append) && append != "") { new_variables <- x[select] colnames(new_variables) <- paste0(colnames(new_variables), append) if (length(variable_labels)) { variable_labels <- c( variable_labels, stats::setNames(variable_labels[select], colnames(new_variables)) ) } if (length(value_labels)) { value_labels <- c( value_labels, stats::setNames(value_labels[select], colnames(new_variables)) ) } x <- cbind(x, new_variables) select <- colnames(new_variables) } # check for reference center and scale if (!is.null(.center)) { # for center(), we have no scale - set it to default value if (is.null(.scale)) { .scale <- rep(1, length(.center)) } # center and scale must have same length if (length(.center) != length(.scale)) { insight::format_error("`center` and `scale` must be of same length.") } # center and scale must either be of length 1 or of same length as selected variables if (length(.center) > 1 && length(.center) != length(select)) { insight::format_error( "`center` and `scale` must have the same length as the selected variables for standardization or centering." ) } # if of length 1, recycle if (length(.center) == 1) { .center <- rep(.center, length(select)) .scale <- rep(.scale, length(select)) } # set names if (is.null(names(.center))) { .center <- stats::setNames(.center, select) } if (is.null(names(.scale))) { .scale <- stats::setNames(.scale, select) } } else { # use NA if missing, so we can index these as vectors .center <- stats::setNames(rep(NA, length(select)), select) .scale <- stats::setNames(rep(NA, length(select)), select) } # add back variable labels if (length(variable_labels)) { for (i in names(variable_labels)) { attr(x[[i]], "label") <- variable_labels[[i]] } } if (preserve_value_labels && length(value_labels)) { for (i in names(value_labels)) { attr(x[[i]], "labels") <- value_labels[[i]] } } list( x = x, select = select, exclude = exclude, weights = weights, append = append, center = .center, scale = .scale ) } # retrieve center and scale information ---- #' @keywords internal .get_center_scale <- function( x, robust = FALSE, weights = NULL, reference = NULL, .center = NULL, .scale = NULL, verbose = TRUE ) { if (is.null(reference)) { reference <- x } # for center(), we have no scale. default to 1 if (is.null(.scale) || is.na(.scale) || isFALSE(.scale)) { scale <- 1 } else if (isTRUE(.scale)) { if (robust) { scale <- weighted_mad(reference, weights) } else { scale <- weighted_sd(reference, weights) } } else { # we must have a numeric value here scale <- .scale } # process center if (is.null(.center) || is.na(.center) || isFALSE(.center)) { center <- 0 } else if (isTRUE(.center)) { if (robust) { center <- weighted_median(reference, weights) } else { center <- weighted_mean(reference, weights) } } else { # we must have a numeric value here center <- .center } if (scale == 0) { scale <- 1 if (verbose) { insight::format_warning(sprintf( "%s is 0 - variable not standardized (only scaled).", if (robust) "MAD" else "SD" )) } } list(center = center, scale = scale) } # check range of input variables ---- #' @keywords internal .check_standardize_numeric <- function( x, name = NULL, verbose = TRUE, reference = NULL, center ) { # Warning if only one value if ( insight::has_single_value(x) && is.null(reference) && (is.null(center) || isTRUE(center)) ) { if (verbose) { if (is.null(name)) { insight::format_alert( "The variable contains only one unique value and will be set to 0." ) } else { insight::format_alert( paste0( "The variable `", name, "` contains only one unique value and will be set to 0." ) ) } } return(NULL) } # Warning if logical vector if ( verbose && insight::n_unique(x) == 2 && !is.factor(x) && !is.character(x) ) { if (is.null(name)) { insight::format_alert( "The variable contains only two different values. Consider converting it to a factor." ) } else { insight::format_alert( paste0( "Variable `", name, "` contains only two different values. Consider converting it to a factor." ) ) } } x } # process append argument ---- #' @keywords internal .process_append <- function( x, select, append, append_suffix = "_z", preserve_value_labels = FALSE, keep_factors = TRUE, keep_character = FALSE ) { # check append argument, and set default if (isFALSE(append)) { append <- NULL } else if (isTRUE(append)) { append <- append_suffix } # append recoded variables if (!is.null(append) && append != "") { # keep or drop factors and characters select <- .select_variables( x, select, exclude = NULL, keep_factors = keep_factors, keep_character = keep_character ) # copy label attributes variable_labels <- insight::compact_list(lapply( x, attr, "label", exact = TRUE )) value_labels <- NULL if (preserve_value_labels) { value_labels <- insight::compact_list(lapply( x, attr, "labels", exact = TRUE )) } # add new variables that sould be appended new_variables <- x[select] colnames(new_variables) <- paste0(colnames(new_variables), append) if (length(variable_labels)) { variable_labels <- c( variable_labels, stats::setNames(variable_labels[select], colnames(new_variables)) ) } if (length(value_labels)) { value_labels <- c( value_labels, stats::setNames(value_labels[select], colnames(new_variables)) ) } x <- cbind(x, new_variables) select <- colnames(new_variables) # add back variable labels if (length(variable_labels)) { for (i in names(variable_labels)) { attr(x[[i]], "label") <- variable_labels[[i]] } } if (preserve_value_labels && length(value_labels)) { for (i in names(value_labels)) { attr(x[[i]], "labels") <- value_labels[[i]] } } } list(x = x, select = select) } # variables to standardize and center ---- # # This function mainly serves the purpose to keep or drop factors and # character vectors from transformation functions. # #' @keywords internal .select_variables <- function( x, select, exclude, keep_factors, keep_character = FALSE ) { if (is.null(select)) { select <- names(x) } if (!is.null(exclude)) { select <- setdiff(select, exclude) } if (!keep_factors) { if (!keep_character) { factors <- vapply( x[select], function(i) is.factor(i) | is.character(i), FUN.VALUE = logical(1L) ) } else { factors <- vapply(x[select], is.factor, FUN.VALUE = logical(1L)) } select <- select[!factors] } select } # for grouped df --------------------------- #' @keywords internal .process_grouped_df <- function( x, select, exclude, append, append_suffix = "_z", reference, weights, keep_factors ) { if (!is.null(reference)) { insight::format_error( "The `reference` argument cannot be used with grouped standardization for now." ) } # check append argument, and set default if (isFALSE(append)) { append <- NULL } else if (isTRUE(append)) { append <- append_suffix } info <- attributes(x) grps <- attr(x, "groups", exact = TRUE)[[".rows"]] # for grouped data frames, we can decide to remove group variable from selection grp_vars <- setdiff(colnames(attr(x, "groups", exact = TRUE)), ".rows") if (is.numeric(weights)) { insight::format_warning( "For grouped data frames, `weights` must be a character, not a numeric vector.", "Ignoring weightings." ) weights <- NULL } x <- as.data.frame(x) select <- .select_variables(x, select, exclude, keep_factors) select <- setdiff(select, grp_vars) # append standardized variables if (!is.null(append) && append != "") { new_variables <- x[select] colnames(new_variables) <- paste0(colnames(new_variables), append) x <- cbind(x, new_variables) select <- colnames(new_variables) info$names <- c(info$names, select) } list(x = x, info = info, select = select, grps = grps, weights = weights) } ================================================ FILE: R/visualisation_recipe.R ================================================ #' Prepare objects for visualisation #' #' @description This function prepares objects for visualisation by returning a list of #' layers with data and geoms that can be easily plotted using for instance #' `ggplot2`. #' #' If the `see` package is installed, the call to `visualization_recipe()` can be #' replaced by `plot()`, which will internally call the former and then plot it #' using `ggplot`. The resulting plot can be customized ad-hoc (by adding #' ggplot's geoms, theme or specifications), or via some of the arguments #' of `visualisation_recipe()` that control the aesthetic parameters. #' #' See the specific documentation page for your object's class: #' #' - {modelbased}: #' - {correlation}: #' #' @param x An `easystats` object. #' @param ... Other arguments passed to other functions. #' #' @export visualisation_recipe <- function(x, ...) { UseMethod("visualisation_recipe") } #' @export print.visualisation_recipe <- function(x, ...) { for (i in seq_along(x)) { l <- x[[paste0("l", i)]] insight::print_color(paste0("Layer ", i, "\n--------\n"), "blue") insight::print_color( paste0("Geom type: ", ifelse(is.null(l$geom), "[NULL]", l$geom), "\n"), "yellow" ) elements <- names(l)[!vapply(l, is.null, FUN.VALUE = logical(1L))] # Loop through all elements of list for (element in elements[elements != "geom"]) { # Print element name if (element == "aes") { cat("aes_string(\n") } else { cat(paste0(element, " = ")) } # Print element if (element == "data") { cat(paste0("[", paste0(dim(l$data), collapse = " x "), "]")) } else if (element == "aes") { for (aes in names(l$aes)) { if (!is.null(l$aes[[aes]])) { if (is.character(l$aes[[aes]])) { cat(paste0(" ", aes, " = '", l$aes[[aes]], "'\n")) } else { cat(paste0(" ", aes, " = ", l$aes[[aes]], "\n")) } } } cat(")") } else { if ( is.character(l[[element]]) || is.numeric(l[[element]]) || is.factor(l[[element]]) ) { if (is.character(l[[element]])) { cat(paste0("'", l[[element]], "'")) } else { if (length(l[[element]]) == 1) { cat(l[[element]]) } else { cat(paste0("c(", toString(l[[element]]), ")")) } } } else { cat(paste0("class: ", class(l[[element]]), collapse = "/")) } } cat("\n") } cat("\n") } } #' @export plot.visualisation_recipe <- function(x, ...) { insight::check_if_installed("see") NextMethod() } ================================================ FILE: R/weighted_mean_median_sd_mad.R ================================================ #' Weighted Mean, Median, SD, and MAD #' #' @inheritParams stats::weighted.mean #' @inheritParams stats::mad #' @param weights A numerical vector of weights the same length as `x` giving #' the weights to use for elements of `x`. If `weights = NULL`, `x` is passed #' to the non-weighted function. #' @param verbose Show warning when `weights` are negative? #' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) and infinite #' values from `x` and `weights`. #' #' @examples #' ## GPA from Siegel 1994 #' x <- c(3.7, 3.3, 3.5, 2.8) #' wt <- c(5, 5, 4, 1) / 15 #' #' weighted_mean(x, wt) #' weighted_median(x, wt) #' #' weighted_sd(x, wt) #' weighted_mad(x, wt) #' #' @export weighted_mean <- function( x, weights = NULL, remove_na = TRUE, verbose = TRUE, ... ) { if (!.are_weights(weights) || !.validate_weights(weights, verbose)) { return(mean(x, na.rm = remove_na)) } # remove missings complete <- .clean_missings(x, weights, remove_na) stats::weighted.mean(complete$x, complete$weights, na.rm = remove_na) } #' @export #' @rdname weighted_mean weighted_median <- function( x, weights = NULL, remove_na = TRUE, verbose = TRUE, ... ) { if (!.are_weights(weights) || !.validate_weights(weights, verbose)) { return(stats::median(x, na.rm = remove_na)) } p <- 0.5 # split probability # remove missings complete <- .clean_missings(x, weights, remove_na) order <- order(complete$x) x <- complete$x[order] weights <- complete$weights[order] rw <- cumsum(weights) / sum(weights) # validation check if (all(is.na(rw))) { return(NA_real_) } md.values <- min(which(rw >= p)) if (rw[md.values] == p) { q <- mean(x[md.values:(md.values + 1)]) } else { q <- x[md.values] } q } #' @export #' @rdname weighted_mean weighted_sd <- function( x, weights = NULL, remove_na = TRUE, verbose = TRUE, ... ) { # from cov.wt if (!.are_weights(weights) || !.validate_weights(weights, verbose)) { return(stats::sd(x, na.rm = remove_na)) } # remove missings complete <- .clean_missings(x, weights, remove_na) weights1 <- complete$weights / sum(complete$weights) center <- sum(weights1 * complete$x) xc <- sqrt(weights1) * (complete$x - center) var <- (t(xc) %*% xc) / (1 - sum(weights1^2)) sqrt(as.vector(var)) } #' @export #' @rdname weighted_mean weighted_mad <- function( x, weights = NULL, constant = 1.4826, remove_na = TRUE, verbose = TRUE, ... ) { # From matrixStats if (!.are_weights(weights) || !.validate_weights(weights, verbose)) { return(stats::mad(x, na.rm = remove_na)) } center <- weighted_median(x, weights = weights, remove_na = remove_na) x <- abs(x - center) constant * weighted_median(x, weights = weights, remove_na = remove_na) } # Utils ------------------------------------------------------------------- .validate_weights <- function(weights, verbose = TRUE) { pos <- all(weights > 0, na.rm = TRUE) if (isTRUE(!pos) && isTRUE(verbose)) { insight::format_warning( "Some `weights` were negative. Weighting not carried out." ) } pos } .clean_missings <- function(x, weights, remove_na) { if (isTRUE(remove_na)) { flag <- FALSE if (any(is.infinite(x)) || any(is.infinite(weights))) { # remove Inf x[is.infinite(x)] <- NA weights[is.infinite(weights)] <- NA flag <- TRUE } if (anyNA(x) || anyNA(weights)) { # remove missings x[is.na(weights)] <- NA weights[is.na(x)] <- NA flag <- TRUE } if (flag) { weights <- stats::na.omit(weights) x <- stats::na.omit(x) } } list(x = x, weights = weights) } ================================================ FILE: R/winsorize.R ================================================ #' Winsorize data #' #' @details #' #' Winsorizing or winsorization is the transformation of statistics by limiting #' extreme values in the statistical data to reduce the effect of possibly #' spurious outliers. The distribution of many statistics can be heavily #' influenced by outliers. A typical strategy is to set all outliers (values #' beyond a certain threshold) to a specified percentile of the data; for #' example, a `90%` winsorization would see all data below the 5th percentile set #' to the 5th percentile, and data above the 95th percentile set to the 95th #' percentile. Winsorized estimators are usually more robust to outliers than #' their more standard forms. #' #' @return #' #' A data frame with winsorized columns or a winsorized vector. #' #' @param data data frame or vector. #' @param threshold The amount of winsorization, depends on the value of `method`: #' - For `method = "percentile"`: the amount to winsorize from *each* tail. #' The value of `threshold` must be between 0 and 0.5 and of length 1. #' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the #' *mean*/*median* (see `robust`). The value of `threshold` must be greater #' than 0 and of length 1. #' - For `method = "raw"`: a vector of length 2 with the lower and upper bound #' for winsorization. #' @param method One of "percentile" (default), "zscore", or "raw". #' @param robust Logical, if TRUE, winsorizing through the "zscore" method is #' done via the median and the median absolute deviation (MAD); if FALSE, via #' the mean and the standard deviation. #' @param ... Currently not used. #' @param verbose Not used anymore since `datawizard` 0.6.6. #' #' @examples #' hist(iris$Sepal.Length, main = "Original data") #' #' hist(winsorize(iris$Sepal.Length, threshold = 0.2), #' xlim = c(4, 8), main = "Percentile Winsorization" #' ) #' #' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), #' xlim = c(4, 8), main = "Mean (+/- SD) Winsorization" #' ) #' #' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), #' xlim = c(4, 8), main = "Median (+/- MAD) Winsorization" #' ) #' #' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), #' xlim = c(4, 8), main = "Raw Thresholds" #' ) #' #' # Also works on a data frame: #' winsorize(iris, threshold = 0.2) #' #' @inherit data_rename seealso #' @export winsorize <- function(data, ...) { UseMethod("winsorize") } #' @export winsorize.factor <- function(data, ...) { data } #' @export winsorize.character <- winsorize.factor #' @export winsorize.logical <- winsorize.factor #' @export winsorize.data.frame <- function( data, threshold = 0.2, method = "percentile", robust = FALSE, verbose = TRUE, ... ) { data[] <- lapply( data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose ) data } #' @rdname winsorize #' @export winsorize.numeric <- function( data, threshold = 0.2, method = "percentile", robust = FALSE, verbose = TRUE, ... ) { method <- match.arg(method, choices = c("percentile", "zscore", "raw")) if (method == "raw" && length(threshold) != 2L) { insight::format_error( "`threshold` must be of length 2 for lower and upper bound." ) } if (method == "percentile") { if (threshold < 0 || threshold > 0.5) { insight::format_error( "`threshold` for winsorization must be a scalar between 0 and 0.5." ) } y <- sort(data) n <- length(data) ibot <- floor(threshold * n) + 1 itop <- length(data) - ibot + 1 threshold <- c(y[ibot], y[itop]) } if (method == "zscore") { if (threshold <= 0) { insight::format_error( "'threshold' for winsorization must be a scalar greater than 0." ) } if (isTRUE(robust)) { centeral <- stats::median(data, na.rm = TRUE) deviation <- stats::mad(data, center = centeral, na.rm = TRUE) } else { centeral <- mean(data, na.rm = TRUE) deviation <- stats::sd(data, na.rm = TRUE) } threshold <- centeral + c(-1, 1) * deviation * threshold } data[data < threshold[1]] <- threshold[1] data[data > threshold[2]] <- threshold[2] return(data) } ================================================ FILE: README.Rmd ================================================ --- output: github_document --- # `datawizard`: Easy Data Wrangling and Statistical Transformations ```{r, echo=FALSE, warning=FALSE, message=FALSE} knitr::opts_chunk$set( collapse = TRUE, dpi = 300, out.width = "100%", fig.path = "man/figures/", comment = "#>" ) set.seed(333) library(datawizard) ``` [![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684) [![downloads](https://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard) [![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) `{datawizard}` is a lightweight package to easily manipulate, clean, transform, and prepare your data for analysis. It is part of the [easystats ecosystem](https://easystats.github.io/easystats/), a suite of R packages to deal with your entire statistical analysis, from cleaning the data to reporting the results. It covers two aspects of data preparation: - **Data manipulation**: `{datawizard}` offers a very similar set of functions to that of the *tidyverse* packages, such as a `{dplyr}` and `{tidyr}`, to select, filter and reshape data, with a few key differences. 1) All data manipulation functions start with the prefix `data_*` (which makes them easy to identify). 2) Although most functions can be used exactly as their *tidyverse* equivalents, they are also string-friendly (which makes them easy to program with and use inside functions). Finally, `{datawizard}` is super lightweight (no dependencies, similar to [poorman](https://github.com/nathaneastwood/poorman)), which makes it awesome for developers to use in their packages. - **Statistical transformations**: `{datawizard}` also has powerful functions to easily apply common data [transformations](https://easystats.github.io/datawizard/reference/index.html#statistical-transformations), including standardization, normalization, rescaling, rank-transformation, scale reversing, recoding, binning, etc.

# Installation [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/datawizard)](https://cran.r-project.org/package=datawizard) [![datawizard status badge](https://easystats.r-universe.dev/badges/datawizard)](https://easystats.r-universe.dev) [![codecov](https://codecov.io/gh/easystats/datawizard/branch/main/graph/badge.svg)](https://app.codecov.io/gh/easystats/datawizard) [![R-CMD-check](https://github.com/easystats/datawizard/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/easystats/datawizard/actions) Type | Source | Command ---|---|--- Release | CRAN | `install.packages("datawizard")` Development | r-universe | `install.packages("datawizard", repos = "https://easystats.r-universe.dev")` Development | GitHub | `remotes::install_github("easystats/datawizard")` > **Tip** > > **Instead of `library(datawizard)`, use `library(easystats)`.** > **This will make all features of the easystats-ecosystem available.** > > **To stay updated, use `easystats::install_latest()`.** # Citation To cite the package, run the following command: ```{r, comment=""} citation("datawizard") ``` # Features [![Documentation](https://img.shields.io/badge/documentation-datawizard-orange.svg?colorB=E91E63)](https://easystats.github.io/datawizard/) [![Blog](https://img.shields.io/badge/blog-easystats-orange.svg?colorB=FF9800)](https://easystats.github.io/blog/posts/) [![Features](https://img.shields.io/badge/features-datawizard-orange.svg?colorB=2196F3)](https://easystats.github.io/datawizard/reference/index.html) Most courses and tutorials about statistical modeling assume that you are working with a clean and tidy dataset. In practice, however, a major part of doing statistical modeling is preparing your data--cleaning up values, creating new columns, reshaping the dataset, or transforming some variables. `{datawizard}` provides easy to use tools to perform these common, critical, and sometimes tedious data preparation tasks. ## Data wrangling ### Select, filter and remove variables The package provides helpers to filter rows meeting certain conditions... ```{r} data_match(mtcars, data.frame(vs = 0, am = 1)) ``` ... or logical expressions: ```{r} data_filter(mtcars, vs == 0 & am == 1) ``` Finding columns in a data frame, or retrieving the data of selected columns, can be achieved using `extract_column_names()` or `data_select()`: ```{r} # find column names matching a pattern extract_column_names(iris, starts_with("Sepal")) # return data columns matching a pattern data_select(iris, starts_with("Sepal")) |> head() ``` It is also possible to extract one or more variables: ```{r} # single variable data_extract(mtcars, "gear") # more variables head(data_extract(iris, ends_with("Width"))) ``` Due to the consistent API, removing variables is just as simple: ```{r} head(data_remove(iris, starts_with("Sepal"))) ``` ### Reorder or rename ```{r} head(data_relocate(iris, select = "Species", before = "Sepal.Length")) ``` ```{r} head(data_rename(iris, c("Sepal.Length", "Sepal.Width"), c("length", "width"))) ``` ### Merge ```{r} x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3) y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4) x y data_merge(x, y, join = "full") data_merge(x, y, join = "left") data_merge(x, y, join = "right") data_merge(x, y, join = "semi", by = "c") data_merge(x, y, join = "anti", by = "c") data_merge(x, y, join = "inner") data_merge(x, y, join = "bind") ``` ### Reshape A common data wrangling task is to reshape data. Either to go from wide/Cartesian to long/tidy format ```{r} wide_data <- data.frame(replicate(5, rnorm(10))) head(data_to_long(wide_data)) ``` or the other way ```{r} long_data <- data_to_long(wide_data, rows_to = "Row_ID") # Save row number data_to_wide(long_data, names_from = "name", values_from = "value", id_cols = "Row_ID" ) ``` ### Empty rows and columns ```{r} tmp <- data.frame( a = c(1, 2, 3, NA, 5), b = c(1, NA, 3, NA, 5), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5) ) tmp # indices of empty columns or rows empty_columns(tmp) empty_rows(tmp) # remove empty columns or rows remove_empty_columns(tmp) remove_empty_rows(tmp) # remove empty columns and rows remove_empty(tmp) ``` ### Recode or cut dataframe ```{r} set.seed(123) x <- sample(1:10, size = 50, replace = TRUE) table(x) # cut into 3 groups, based on distribution (quantiles) table(categorize(x, split = "quantile", n_groups = 3)) ``` ## Data Transformations The packages also contains multiple functions to help transform data. ### Standardize For example, to standardize (*z*-score) data: ```{r} # before summary(swiss) # after summary(standardize(swiss)) ``` ### Winsorize To winsorize data: ```{r} # before anscombe # after winsorize(anscombe) ``` ### Center To grand-mean center data ```{r} center(anscombe) ``` ### Ranktransform To rank-transform data: ```{r} # before head(trees) # after head(ranktransform(trees)) ``` ### Rescale To rescale a numeric variable to a new range: ```{r} change_scale(c(0, 1, 5, -5, -2)) ``` ### Rotate or transpose ```{r} x <- mtcars[1:3, 1:4] x data_rotate(x) ``` ## Data properties `datawizard` provides a way to provide comprehensive descriptive summary for all variables in a dataframe: ```{r} data(iris) describe_distribution(iris) ``` Or even just a variable ```{r} describe_distribution(mtcars$wt) ``` There are also some additional data properties that can be computed using this package. ```{r} x <- (-10:10)^3 + rnorm(21, 0, 100) smoothness(x, method = "diff") ``` ## Function design and pipe-workflow The design of the `{datawizard}` functions follows a design principle that makes it easy for user to understand and remember how functions work: 1. the first argument is the data 2. for methods that work on data frames, two arguments are following to `select` and `exclude` variables 3. the following arguments are arguments related to the specific tasks of the functions Most important, functions that accept data frames usually have this as their first argument, and also return a (modified) data frame again. Thus, `{datawizard}` integrates smoothly into a "pipe-workflow". ```{r} iris |> # all rows where Species is "versicolor" or "virginica" data_filter(Species %in% c("versicolor", "virginica")) |> # select only columns with "." in names (i.e. drop Species) data_select(contains("\\.")) |> # move columns that ends with "Length" to start of data frame data_relocate(ends_with("Length")) |> # remove fourth column data_remove(4) |> head() ``` # Contributing and Support In case you want to file an issue or contribute in another way to the package, please follow [this guide](https://easystats.github.io/datawizard/CONTRIBUTING.html). For questions about the functionality, you may either contact us via email or also file an issue. # Code of Conduct Please note that this project is released with a [Contributor Code of Conduct](https://easystats.github.io/datawizard/CODE_OF_CONDUCT.html). By participating in this project you agree to abide by its terms. ================================================ FILE: README.md ================================================ # `datawizard`: Easy Data Wrangling and Statistical Transformations [![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684) [![downloads](https://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard) [![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) `{datawizard}` is a lightweight package to easily manipulate, clean, transform, and prepare your data for analysis. It is part of the [easystats ecosystem](https://easystats.github.io/easystats/), a suite of R packages to deal with your entire statistical analysis, from cleaning the data to reporting the results. It covers two aspects of data preparation: - **Data manipulation**: `{datawizard}` offers a very similar set of functions to that of the *tidyverse* packages, such as a `{dplyr}` and `{tidyr}`, to select, filter and reshape data, with a few key differences. 1) All data manipulation functions start with the prefix `data_*` (which makes them easy to identify). 2) Although most functions can be used exactly as their *tidyverse* equivalents, they are also string-friendly (which makes them easy to program with and use inside functions). Finally, `{datawizard}` is super lightweight (no dependencies, similar to [poorman](https://github.com/nathaneastwood/poorman)), which makes it awesome for developers to use in their packages. - **Statistical transformations**: `{datawizard}` also has powerful functions to easily apply common data [transformations](https://easystats.github.io/datawizard/reference/index.html#statistical-transformations), including standardization, normalization, rescaling, rank-transformation, scale reversing, recoding, binning, etc.

# Installation [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/datawizard)](https://cran.r-project.org/package=datawizard) [![datawizard status badge](https://easystats.r-universe.dev/badges/datawizard)](https://easystats.r-universe.dev) [![codecov](https://codecov.io/gh/easystats/datawizard/branch/main/graph/badge.svg)](https://app.codecov.io/gh/easystats/datawizard) [![R-CMD-check](https://github.com/easystats/datawizard/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/easystats/datawizard/actions) | Type | Source | Command | |----|----|----| | Release | CRAN | `install.packages("datawizard")` | | Development | r-universe | `install.packages("datawizard", repos = "https://easystats.r-universe.dev")` | | Development | GitHub | `remotes::install_github("easystats/datawizard")` | > **Tip** > > **Instead of `library(datawizard)`, use `library(easystats)`.** **This > will make all features of the easystats-ecosystem available.** > > **To stay updated, use `easystats::install_latest()`.** # Citation To cite the package, run the following command: ``` r citation("datawizard") To cite package 'datawizard' in publications use: Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. Journal of Open Source Software, 7(78), 4684, https://doi.org/10.21105/joss.04684 A BibTeX entry for LaTeX users is @Article{, title = {{datawizard}: An {R} Package for Easy Data Preparation and Statistical Transformations}, author = {Indrajeet Patil and Dominique Makowski and Mattan S. Ben-Shachar and Brenton M. Wiernik and Etienne Bacher and Daniel Lüdecke}, journal = {Journal of Open Source Software}, year = {2022}, volume = {7}, number = {78}, pages = {4684}, doi = {10.21105/joss.04684}, } ``` # Features [![Documentation](https://img.shields.io/badge/documentation-datawizard-orange.svg?colorB=E91E63)](https://easystats.github.io/datawizard/) [![Blog](https://img.shields.io/badge/blog-easystats-orange.svg?colorB=FF9800)](https://easystats.github.io/blog/posts/) [![Features](https://img.shields.io/badge/features-datawizard-orange.svg?colorB=2196F3)](https://easystats.github.io/datawizard/reference/index.html) Most courses and tutorials about statistical modeling assume that you are working with a clean and tidy dataset. In practice, however, a major part of doing statistical modeling is preparing your data–cleaning up values, creating new columns, reshaping the dataset, or transforming some variables. `{datawizard}` provides easy to use tools to perform these common, critical, and sometimes tedious data preparation tasks. ## Data wrangling ### Select, filter and remove variables The package provides helpers to filter rows meeting certain conditions… ``` r data_match(mtcars, data.frame(vs = 0, am = 1)) #> mpg cyl disp hp drat wt qsec vs am gear carb #> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 #> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 #> Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 #> Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 #> Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 #> Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 ``` … or logical expressions: ``` r data_filter(mtcars, vs == 0 & am == 1) #> mpg cyl disp hp drat wt qsec vs am gear carb #> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 #> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 #> Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 #> Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 #> Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 #> Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 ``` Finding columns in a data frame, or retrieving the data of selected columns, can be achieved using `extract_column_names()` or `data_select()`: ``` r # find column names matching a pattern extract_column_names(iris, starts_with("Sepal")) #> [1] "Sepal.Length" "Sepal.Width" # return data columns matching a pattern data_select(iris, starts_with("Sepal")) |> head() #> Sepal.Length Sepal.Width #> 1 5.1 3.5 #> 2 4.9 3.0 #> 3 4.7 3.2 #> 4 4.6 3.1 #> 5 5.0 3.6 #> 6 5.4 3.9 ``` It is also possible to extract one or more variables: ``` r # single variable data_extract(mtcars, "gear") #> [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4 # more variables head(data_extract(iris, ends_with("Width"))) #> Sepal.Width Petal.Width #> 1 3.5 0.2 #> 2 3.0 0.2 #> 3 3.2 0.2 #> 4 3.1 0.2 #> 5 3.6 0.2 #> 6 3.9 0.4 ``` Due to the consistent API, removing variables is just as simple: ``` r head(data_remove(iris, starts_with("Sepal"))) #> Petal.Length Petal.Width Species #> 1 1.4 0.2 setosa #> 2 1.4 0.2 setosa #> 3 1.3 0.2 setosa #> 4 1.5 0.2 setosa #> 5 1.4 0.2 setosa #> 6 1.7 0.4 setosa ``` ### Reorder or rename ``` r head(data_relocate(iris, select = "Species", before = "Sepal.Length")) #> Species Sepal.Length Sepal.Width Petal.Length Petal.Width #> 1 setosa 5.1 3.5 1.4 0.2 #> 2 setosa 4.9 3.0 1.4 0.2 #> 3 setosa 4.7 3.2 1.3 0.2 #> 4 setosa 4.6 3.1 1.5 0.2 #> 5 setosa 5.0 3.6 1.4 0.2 #> 6 setosa 5.4 3.9 1.7 0.4 ``` ``` r head(data_rename(iris, c("Sepal.Length", "Sepal.Width"), c("length", "width"))) #> length width Petal.Length Petal.Width Species #> 1 5.1 3.5 1.4 0.2 setosa #> 2 4.9 3.0 1.4 0.2 setosa #> 3 4.7 3.2 1.3 0.2 setosa #> 4 4.6 3.1 1.5 0.2 setosa #> 5 5.0 3.6 1.4 0.2 setosa #> 6 5.4 3.9 1.7 0.4 setosa ``` ### Merge ``` r x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3) y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4) x #> a b c id #> 1 1 a 5 1 #> 2 2 b 6 2 #> 3 3 c 7 3 y #> c d e id #> 1 6 f 100 2 #> 2 7 g 101 3 #> 3 8 h 102 4 data_merge(x, y, join = "full") #> a b c id d e #> 3 1 a 5 1 NA #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 4 NA 8 4 h 102 data_merge(x, y, join = "left") #> a b c id d e #> 3 1 a 5 1 NA #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 data_merge(x, y, join = "right") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 3 NA 8 4 h 102 data_merge(x, y, join = "semi", by = "c") #> a b c id #> 2 2 b 6 2 #> 3 3 c 7 3 data_merge(x, y, join = "anti", by = "c") #> a b c id #> 1 1 a 5 1 data_merge(x, y, join = "inner") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 data_merge(x, y, join = "bind") #> a b c id d e #> 1 1 a 5 1 NA #> 2 2 b 6 2 NA #> 3 3 c 7 3 NA #> 4 NA 6 2 f 100 #> 5 NA 7 3 g 101 #> 6 NA 8 4 h 102 ``` ### Reshape A common data wrangling task is to reshape data. Either to go from wide/Cartesian to long/tidy format ``` r wide_data <- data.frame(replicate(5, rnorm(10))) head(data_to_long(wide_data)) #> name value #> 1 X1 -0.08281164 #> 2 X2 -1.12490028 #> 3 X3 -0.70632036 #> 4 X4 -0.70278946 #> 5 X5 0.07633326 #> 6 X1 1.93468099 ``` or the other way ``` r long_data <- data_to_long(wide_data, rows_to = "Row_ID") # Save row number data_to_wide(long_data, names_from = "name", values_from = "value", id_cols = "Row_ID" ) #> Row_ID X1 X2 X3 X4 X5 #> 1 1 -0.08281164 -1.12490028 -0.70632036 -0.7027895 0.07633326 #> 2 2 1.93468099 -0.87430362 0.96687656 0.2998642 -0.23035595 #> 3 3 -2.05128979 0.04386162 -0.71016648 1.1494697 0.31746484 #> 4 4 0.27773897 -0.58397514 -0.05917365 -0.3016415 -1.59268440 #> 5 5 -1.52596060 -0.82329858 -0.23094342 -0.5473394 -0.18194062 #> 6 6 -0.26916362 0.11059280 0.69200045 -0.3854041 1.75614174 #> 7 7 1.23305388 0.36472778 1.35682290 0.2763720 0.11394932 #> 8 8 0.63360774 0.05370100 1.78872284 0.1518608 -0.29216508 #> 9 9 0.35271746 1.36867235 0.41071582 -0.4313808 1.75409316 #> 10 10 -0.56048248 -0.38045724 -2.18785470 -1.8705001 1.80958455 ``` ### Empty rows and columns ``` r tmp <- data.frame( a = c(1, 2, 3, NA, 5), b = c(1, NA, 3, NA, 5), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5) ) tmp #> a b c d #> 1 1 1 NA 1 #> 2 2 NA NA NA #> 3 3 3 NA 3 #> 4 NA NA NA NA #> 5 5 5 NA 5 # indices of empty columns or rows empty_columns(tmp) #> c #> 3 empty_rows(tmp) #> [1] 4 # remove empty columns or rows remove_empty_columns(tmp) #> a b d #> 1 1 1 1 #> 2 2 NA NA #> 3 3 3 3 #> 4 NA NA NA #> 5 5 5 5 remove_empty_rows(tmp) #> a b c d #> 1 1 1 NA 1 #> 2 2 NA NA NA #> 3 3 3 NA 3 #> 5 5 5 NA 5 # remove empty columns and rows remove_empty(tmp) #> a b d #> 1 1 1 1 #> 2 2 NA NA #> 3 3 3 3 #> 5 5 5 5 ``` ### Recode or cut dataframe ``` r set.seed(123) x <- sample(1:10, size = 50, replace = TRUE) table(x) #> x #> 1 2 3 4 5 6 7 8 9 10 #> 2 3 5 3 7 5 5 2 11 7 # cut into 3 groups, based on distribution (quantiles) table(categorize(x, split = "quantile", n_groups = 3)) #> #> 1 2 3 #> 13 19 18 ``` ## Data Transformations The packages also contains multiple functions to help transform data. ### Standardize For example, to standardize (*z*-score) data: ``` r # before summary(swiss) #> Fertility Agriculture Examination Education #> Min. :35.00 Min. : 1.20 Min. : 3.00 Min. : 1.00 #> 1st Qu.:64.70 1st Qu.:35.90 1st Qu.:12.00 1st Qu.: 6.00 #> Median :70.40 Median :54.10 Median :16.00 Median : 8.00 #> Mean :70.14 Mean :50.66 Mean :16.49 Mean :10.98 #> 3rd Qu.:78.45 3rd Qu.:67.65 3rd Qu.:22.00 3rd Qu.:12.00 #> Max. :92.50 Max. :89.70 Max. :37.00 Max. :53.00 #> Catholic Infant.Mortality #> Min. : 2.150 Min. :10.80 #> 1st Qu.: 5.195 1st Qu.:18.15 #> Median : 15.140 Median :20.00 #> Mean : 41.144 Mean :19.94 #> 3rd Qu.: 93.125 3rd Qu.:21.70 #> Max. :100.000 Max. :26.60 # after summary(standardize(swiss)) #> Fertility Agriculture Examination Education #> Min. :-2.81327 Min. :-2.1778 Min. :-1.69084 Min. :-1.0378 #> 1st Qu.:-0.43569 1st Qu.:-0.6499 1st Qu.:-0.56273 1st Qu.:-0.5178 #> Median : 0.02061 Median : 0.1515 Median :-0.06134 Median :-0.3098 #> Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 #> 3rd Qu.: 0.66504 3rd Qu.: 0.7481 3rd Qu.: 0.69074 3rd Qu.: 0.1062 #> Max. : 1.78978 Max. : 1.7190 Max. : 2.57094 Max. : 4.3702 #> Catholic Infant.Mortality #> Min. :-0.9350 Min. :-3.13886 #> 1st Qu.:-0.8620 1st Qu.:-0.61543 #> Median :-0.6235 Median : 0.01972 #> Mean : 0.0000 Mean : 0.00000 #> 3rd Qu.: 1.2464 3rd Qu.: 0.60337 #> Max. : 1.4113 Max. : 2.28566 ``` ### Winsorize To winsorize data: ``` r # before anscombe #> x1 x2 x3 x4 y1 y2 y3 y4 #> 1 10 10 10 8 8.04 9.14 7.46 6.58 #> 2 8 8 8 8 6.95 8.14 6.77 5.76 #> 3 13 13 13 8 7.58 8.74 12.74 7.71 #> 4 9 9 9 8 8.81 8.77 7.11 8.84 #> 5 11 11 11 8 8.33 9.26 7.81 8.47 #> 6 14 14 14 8 9.96 8.10 8.84 7.04 #> 7 6 6 6 8 7.24 6.13 6.08 5.25 #> 8 4 4 4 19 4.26 3.10 5.39 12.50 #> 9 12 12 12 8 10.84 9.13 8.15 5.56 #> 10 7 7 7 8 4.82 7.26 6.42 7.91 #> 11 5 5 5 8 5.68 4.74 5.73 6.89 # after winsorize(anscombe) #> x1 x2 x3 x4 y1 y2 y3 y4 #> 1 10 10 10 8 8.04 9.13 7.46 6.58 #> 2 8 8 8 8 6.95 8.14 6.77 5.76 #> 3 12 12 12 8 7.58 8.74 8.15 7.71 #> 4 9 9 9 8 8.81 8.77 7.11 8.47 #> 5 11 11 11 8 8.33 9.13 7.81 8.47 #> 6 12 12 12 8 8.81 8.10 8.15 7.04 #> 7 6 6 6 8 7.24 6.13 6.08 5.76 #> 8 6 6 6 8 5.68 6.13 6.08 8.47 #> 9 12 12 12 8 8.81 9.13 8.15 5.76 #> 10 7 7 7 8 5.68 7.26 6.42 7.91 #> 11 6 6 6 8 5.68 6.13 6.08 6.89 ``` ### Center To grand-mean center data ``` r center(anscombe) #> x1 x2 x3 x4 y1 y2 y3 y4 #> 1 1 1 1 -1 0.53909091 1.6390909 -0.04 -0.9209091 #> 2 -1 -1 -1 -1 -0.55090909 0.6390909 -0.73 -1.7409091 #> 3 4 4 4 -1 0.07909091 1.2390909 5.24 0.2090909 #> 4 0 0 0 -1 1.30909091 1.2690909 -0.39 1.3390909 #> 5 2 2 2 -1 0.82909091 1.7590909 0.31 0.9690909 #> 6 5 5 5 -1 2.45909091 0.5990909 1.34 -0.4609091 #> 7 -3 -3 -3 -1 -0.26090909 -1.3709091 -1.42 -2.2509091 #> 8 -5 -5 -5 10 -3.24090909 -4.4009091 -2.11 4.9990909 #> 9 3 3 3 -1 3.33909091 1.6290909 0.65 -1.9409091 #> 10 -2 -2 -2 -1 -2.68090909 -0.2409091 -1.08 0.4090909 #> 11 -4 -4 -4 -1 -1.82090909 -2.7609091 -1.77 -0.6109091 ``` ### Ranktransform To rank-transform data: ``` r # before head(trees) #> Girth Height Volume #> 1 8.3 70 10.3 #> 2 8.6 65 10.3 #> 3 8.8 63 10.2 #> 4 10.5 72 16.4 #> 5 10.7 81 18.8 #> 6 10.8 83 19.7 # after head(ranktransform(trees)) #> Girth Height Volume #> 1 1 6.0 2.5 #> 2 2 3.0 2.5 #> 3 3 1.0 1.0 #> 4 4 8.5 5.0 #> 5 5 25.5 7.0 #> 6 6 28.0 9.0 ``` ### Rescale To rescale a numeric variable to a new range: ``` r change_scale(c(0, 1, 5, -5, -2)) #> [1] 50 60 100 0 30 #> (original range = -5 to 5) ``` ### Rotate or transpose ``` r x <- mtcars[1:3, 1:4] x #> mpg cyl disp hp #> Mazda RX4 21.0 6 160 110 #> Mazda RX4 Wag 21.0 6 160 110 #> Datsun 710 22.8 4 108 93 data_rotate(x) #> Mazda RX4 Mazda RX4 Wag Datsun 710 #> mpg 21 21 22.8 #> cyl 6 6 4.0 #> disp 160 160 108.0 #> hp 110 110 93.0 ``` ## Data properties `datawizard` provides a way to provide comprehensive descriptive summary for all variables in a dataframe: ``` r data(iris) describe_distribution(iris) #> Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing #> ---------------------------------------------------------------------------------------- #> Sepal.Length | 5.84 | 0.83 | 1.30 | [4.30, 7.90] | 0.31 | -0.55 | 150 | 0 #> Sepal.Width | 3.06 | 0.44 | 0.52 | [2.00, 4.40] | 0.32 | 0.23 | 150 | 0 #> Petal.Length | 3.76 | 1.77 | 3.52 | [1.00, 6.90] | -0.27 | -1.40 | 150 | 0 #> Petal.Width | 1.20 | 0.76 | 1.50 | [0.10, 2.50] | -0.10 | -1.34 | 150 | 0 ``` Or even just a variable ``` r describe_distribution(mtcars$wt) #> Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing #> ------------------------------------------------------------------------ #> 3.22 | 0.98 | 1.19 | [1.51, 5.42] | 0.47 | 0.42 | 32 | 0 ``` There are also some additional data properties that can be computed using this package. ``` r x <- (-10:10)^3 + rnorm(21, 0, 100) smoothness(x, method = "diff") #> [1] 1.791243 #> attr(,"class") #> [1] "parameters_smoothness" "numeric" ``` ## Function design and pipe-workflow The design of the `{datawizard}` functions follows a design principle that makes it easy for user to understand and remember how functions work: 1. the first argument is the data 2. for methods that work on data frames, two arguments are following to `select` and `exclude` variables 3. the following arguments are arguments related to the specific tasks of the functions Most important, functions that accept data frames usually have this as their first argument, and also return a (modified) data frame again. Thus, `{datawizard}` integrates smoothly into a “pipe-workflow”. ``` r iris |> # all rows where Species is "versicolor" or "virginica" data_filter(Species %in% c("versicolor", "virginica")) |> # select only columns with "." in names (i.e. drop Species) data_select(contains("\\.")) |> # move columns that ends with "Length" to start of data frame data_relocate(ends_with("Length")) |> # remove fourth column data_remove(4) |> head() #> Sepal.Length Petal.Length Sepal.Width #> 51 7.0 4.7 3.2 #> 52 6.4 4.5 3.2 #> 53 6.9 4.9 3.1 #> 54 5.5 4.0 2.3 #> 55 6.5 4.6 2.8 #> 56 5.7 4.5 2.8 ``` # Contributing and Support In case you want to file an issue or contribute in another way to the package, please follow [this guide](https://easystats.github.io/datawizard/CONTRIBUTING.html). For questions about the functionality, you may either contact us via email or also file an issue. # Code of Conduct Please note that this project is released with a [Contributor Code of Conduct](https://easystats.github.io/datawizard/CODE_OF_CONDUCT.html). By participating in this project you agree to abide by its terms. ================================================ FILE: air.toml ================================================ [format] line-width = 80 indent-width = 2 indent-style = "space" line-ending = "lf" persistent-line-breaks = true skip = ["tribble"] ================================================ FILE: cran-comments.md ================================================ This fixes R-devel errors reported on 2026-04-23. ================================================ FILE: datawizard.Rproj ================================================ Version: 1.0 RestoreWorkspace: No SaveWorkspace: No AlwaysSaveHistory: No EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: knitr LaTeX: XeLaTeX AutoAppendNewline: Yes StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace QuitChildProcessesOnExit: Yes DisableExecuteRprofile: Yes ================================================ FILE: datawizard.code-workspace ================================================ { "folders": [ { "path": "." } ], "launch": { "version": "0.2.0", "configurations": [ { "type": "R-Debugger", "name": "Launch R-Workspace", "request": "launch", "debugMode": "workspace", "workingDirectory": "" } ] } } ================================================ FILE: inst/CITATION ================================================ bibentry( bibtype="Article", title="{datawizard}: An {R} Package for Easy Data Preparation and Statistical Transformations", author=c(person("Indrajeet", "Patil"), person("Dominique", "Makowski"), person("Mattan S.", "Ben-Shachar"), person("Brenton M.", "Wiernik"), person("Etienne", "Bacher"), person("Daniel", "Lüdecke")), journal="Journal of Open Source Software", year = 2022, volume = 7, number = 78, pages = 4684, doi = "10.21105/joss.04684", textVersion = "Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. Journal of Open Source Software, 7(78), 4684, https://doi.org/10.21105/joss.04684" ) ================================================ FILE: inst/WORDLIST ================================================ AES Analysing Asparouhov BMC Bafumi Brincks Bulotsky CMD Carle Catran Crosstables DEPRECATIONS De Dhaliwal Disaggregating EFC EUROFAMCARE Enders Fairbrother GCM GLMM Gelman Giesecke Giesselmann Guo Heisig Herrington Hoffmann Joanes Kish Llabre Lumley MADs Mattan Minitab ORCID OpenBLAS PSU Ranktransform Routledge SDs Schaeffer Shachar Stata Tidyverse TitleCase Verkuilen Verkuilen's Winsorize Winsorized Winsorizing al behaviour behaviours brms codebook codebooks codecov crosstable crosstables csv de decrypt decrypted doi easystats effectsize endogeneity et geoms ggplot's https ing interpretability inversed joss labelled labelling leptokurtic lm lme meaned mesokurtic midhinge modelbased nanoparquet nd noLD openssl panelr partialization platykurtic poorman pre px readr readxl relevel rio rowid rstanarm sd stackexchange tailedness th tibble tibbles tidyverse unitless unstored visualisation wikipedia winsorization winsorize winsorized winsorizes winsorizing zscore ================================================ FILE: man/adjust.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/adjust.R \name{adjust} \alias{adjust} \alias{data_adjust} \title{Adjust data for the effect of other variable(s)} \usage{ adjust( data, effect = NULL, select = is.numeric, exclude = NULL, multilevel = FALSE, additive = FALSE, bayesian = FALSE, keep_intercept = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE ) data_adjust( data, effect = NULL, select = is.numeric, exclude = NULL, multilevel = FALSE, additive = FALSE, bayesian = FALSE, keep_intercept = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE ) } \arguments{ \item{data}{A data frame.} \item{effect}{Character vector of column names to be adjusted for (regressed out). If \code{NULL} (the default), all variables will be selected.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{multilevel}{If \code{TRUE}, the factors are included as random factors. Else, if \code{FALSE} (default), they are included as fixed effects in the simple regression model.} \item{additive}{If \code{TRUE}, continuous variables as included as smooth terms in additive models. The goal is to regress-out potential non-linear effects.} \item{bayesian}{If \code{TRUE}, the models are fitted under the Bayesian framework using \code{rstanarm}.} \item{keep_intercept}{If \code{FALSE} (default), the intercept of the model is re-added. This avoids the centering around 0 that happens by default when regressing out another variable (see the examples below for a visual representation of this).} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A data frame comparable to \code{data}, with adjusted variables. } \description{ This function can be used to adjust the data for the effect of other variables present in the dataset. It is based on an underlying fitting of regressions models, allowing for quite some flexibility, such as including factors as random effects in mixed models (multilevel partialization), continuous variables as smooth terms in general additive models (non-linear partialization) and/or fitting these models under a Bayesian framework. The values returned by this function are the residuals of the regression models. Note that a regular correlation between two "adjusted" variables is equivalent to the partial correlation between them. } \examples{ \dontshow{if (all(insight::check_if_installed(c("bayestestR", "rstanarm", "gamm4"), quietly = TRUE))) withAutoprint(\{ # examplesIf} adjusted_all <- adjust(attitude) head(adjusted_all) adjusted_one <- adjust(attitude, effect = "complaints", select = "rating") head(adjusted_one) \donttest{ adjust(attitude, effect = "complaints", select = "rating", bayesian = TRUE) adjust(attitude, effect = "complaints", select = "rating", additive = TRUE) attitude$complaints_LMH <- cut(attitude$complaints, 3) adjust(attitude, effect = "complaints_LMH", select = "rating", multilevel = TRUE) } # Generate data data <- bayestestR::simulate_correlation(n = 100, r = 0.7) data$V2 <- (5 * data$V2) + 20 # Add intercept # Adjust adjusted <- adjust(data, effect = "V1", select = "V2") adjusted_icpt <- adjust(data, effect = "V1", select = "V2", keep_intercept = TRUE) # Visualize plot( data$V1, data$V2, pch = 19, col = "blue", ylim = c(min(adjusted$V2), max(data$V2)), main = "Original (blue), adjusted (green), and adjusted - intercept kept (red) data" ) abline(lm(V2 ~ V1, data = data), col = "blue") points(adjusted$V1, adjusted$V2, pch = 19, col = "green") abline(lm(V2 ~ V1, data = adjusted), col = "green") points(adjusted_icpt$V1, adjusted_icpt$V2, pch = 19, col = "red") abline(lm(V2 ~ V1, data = adjusted_icpt), col = "red") \dontshow{\}) # examplesIf} } ================================================ FILE: man/as.prop.table.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_tabulate.R \name{as.prop.table} \alias{as.prop.table} \alias{as.prop.table.datawizard_crosstab} \alias{as.data.frame.datawizard_tables} \alias{as.table.datawizard_table} \title{Convert a crosstable to a frequency or a propensity table} \usage{ as.prop.table(x, ...) \method{as.prop.table}{datawizard_crosstab}(x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ...) \method{as.data.frame}{datawizard_tables}( x, row.names = NULL, optional = FALSE, ..., stringsAsFactors = FALSE, add_total = FALSE ) \method{as.table}{datawizard_table}(x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ...) } \arguments{ \item{x}{An object created by \code{data_tabulate()}. It must be of class \code{datawizard_crosstab} for \code{as.prop.table()}.} \item{...}{not used.} \item{remove_na}{Logical, if \code{FALSE}, missing values are included in the frequency or crosstable, else missing values are omitted. Note that the default for the \code{as.table()} method is \code{remove_na = TRUE}, so that missing values are not included in the returned table, which makes more sense for post-processing of the table, e.g. using \code{chisq.test()}.} \item{simplify}{Logical, if \code{TRUE}, the returned table is simplified to a single table object if there is only one frequency or contingency table input. Else, always for multiple table inputs or when \code{simplify = FALSE}, a list of tables is returned. This is only relevant for the \code{as.table()} methods. To ensure consistent output, the default is \code{FALSE}.} \item{verbose}{Toggle warnings and messages.} \item{row.names}{\code{NULL} or a character vector giving the row names for the data frame. Missing values are not allowed.} \item{optional}{logical. If \code{TRUE}, setting row names and converting column names (to syntactic names: see \code{\link[base]{make.names}}) is optional. Note that all of \R's \pkg{base} package \code{as.data.frame()} methods use \code{optional} only for column names treatment, basically with the meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}. See also the \code{make.names} argument of the \code{matrix} method.} \item{stringsAsFactors}{logical: should the character vector be converted to a factor?} \item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and column with the total N values are added to the data frame. \code{add_total} has no effect in \code{as.data.frame()} for simple frequency tables.} } \description{ \code{as.prop.table()} is an S3 generic. It can be used on objects of class \code{datawizard_crosstab} created by \code{data_tabulate()} when it was run with the arguments \code{by} and \code{proportions}. } \examples{ data(efc) # Some cross tabulation cross <- data_tabulate(efc, select = "e42dep", by = "c172code", proportions = "row") cross # Convert to a propensity table as.prop.table(cross) # Convert to data.frame result <- data_tabulate(efc, "c172code", by = "e16sex") as.data.frame(result) as.data.frame(result)$table as.data.frame(result, add_total = TRUE)$table # Convert to a table that can be passed to chisq.test() out <- data_tabulate(efc, "c172code", by = "e16sex") # we need to simplify the output, else we get a list of tables tbl <- as.table(out, simplify = TRUE) tbl suppressWarnings(chisq.test(tbl)) # apply chisq.test to each table out <- data_tabulate(efc, c("c172code", "e16sex")) suppressWarnings(lapply(as.table(out), chisq.test)) # can also handle grouped data frames d <- data_group(mtcars, "am") x <- data_tabulate(d, "cyl", by = "gear") as.table(x) } \seealso{ \link{data_tabulate} } ================================================ FILE: man/assign_labels.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_labels.R \name{assign_labels} \alias{assign_labels} \alias{assign_labels.numeric} \alias{assign_labels.data.frame} \title{Assign variable and value labels} \usage{ assign_labels(x, ...) \method{assign_labels}{numeric}(x, variable = NULL, values = NULL, ...) \method{assign_labels}{data.frame}( x, select = NULL, exclude = NULL, values = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A data frame, factor or vector.} \item{...}{Currently not used.} \item{variable}{The variable label as string.} \item{values}{The value labels as (named) character vector. If \code{values} is \emph{not} a named vector, the length of labels must be equal to the length of unique values. For a named vector, the left-hand side (LHS) is the value in \code{x}, the right-hand side (RHS) the associated value label. Non-matching labels are omitted.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A labelled variable, or a data frame of labelled variables. } \description{ Assign variable and values labels to a variable or variables in a data frame. Labels are stored as attributes (\code{"label"} for variable labels and \code{"labels"}) for value labels. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ x <- 1:3 # labelling by providing required number of labels assign_labels( x, variable = "My x", values = c("one", "two", "three") ) # labelling using named vectors data(iris) out <- assign_labels( iris$Species, variable = "Labelled Species", values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3") ) str(out) # data frame example out <- assign_labels( iris, select = "Species", variable = "Labelled Species", values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3") ) str(out$Species) # Partial labelling x <- 1:5 assign_labels( x, variable = "My x", values = c(`1` = "lowest", `5` = "highest") ) } ================================================ FILE: man/categorize.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/categorize.R \name{categorize} \alias{categorize} \alias{categorize.numeric} \alias{categorize.data.frame} \title{Recode (or "cut" / "bin") data into groups of values.} \usage{ categorize(x, ...) \method{categorize}{numeric}( x, split = "median", n_groups = NULL, range = NULL, lowest = 1, breaks = "exclusive", labels = NULL, verbose = TRUE, ... ) \method{categorize}{data.frame}( x, select = NULL, exclude = NULL, split = "median", n_groups = NULL, range = NULL, lowest = 1, breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A (grouped) data frame, numeric vector or factor.} \item{...}{not used.} \item{split}{Character vector, indicating at which breaks to split variables, or numeric values with values indicating breaks. If character, may be one of \code{"median"}, \code{"mean"}, \code{"quantile"}, \code{"equal_length"}, or \code{"equal_range"}. \code{"median"} or \code{"mean"} will return dichotomous variables, split at their mean or median, respectively. \code{"quantile"} and \code{"equal_length"} will split the variable into \code{n_groups} groups, where each group refers to an interval of a specific range of values. Thus, the length of each interval will be based on the number of groups. \code{"equal_range"} also splits the variable into multiple groups, however, the length of the interval is given, and the number of resulting groups (and hence, the number of breaks) will be determined by how many intervals can be generated, based on the full range of the variable.} \item{n_groups}{If \code{split} is \code{"quantile"} or \code{"equal_length"}, this defines the number of requested groups (i.e. resulting number of levels or values) for the recoded variable(s). \code{"quantile"} will define intervals based on the distribution of the variable, while \code{"equal_length"} tries to divide the range of the variable into pieces of equal length.} \item{range}{If \code{split = "equal_range"}, this defines the range of values that are recoded into a new value.} \item{lowest}{Minimum value of the recoded variable(s). If \code{NULL} (the default), for numeric variables, the minimum of the original input is preserved. For factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the default minimum is always \code{1}, unless specified otherwise in \code{lowest}.} \item{breaks}{Character, indicating whether breaks for categorizing data are \code{"inclusive"} (values indicate the \emph{upper} bound of the \emph{previous} group or interval) or \code{"exclusive"} (values indicate the \emph{lower} bound of the \emph{next} group or interval to begin). Use \code{labels = "range"} to make this behaviour easier to see.} \item{labels}{Character vector of value labels. If not \code{NULL}, \code{categorize()} will returns factors instead of numeric variables, with \code{labels} used for labelling the factor levels. Can also be \code{"mean"}, \code{"median"}, \code{"range"} or \code{"observed"} for a factor with labels as the mean/median, the requested range (even if not all values of that range are present in the data) or observed range (range of the actual recoded values) of each group. See 'Examples'.} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ \code{x}, recoded into groups. By default \code{x} is numeric, unless \code{labels} is specified. In this case, a factor is returned, where the factor levels (i.e. recoded groups are labelled accordingly. } \description{ This functions divides the range of variables into intervals and recodes the values inside these intervals according to their related interval. It is basically a wrapper around base R's \code{cut()}, providing a simplified and more accessible way to define the interval breaks (cut-off values). } \section{Splits and breaks (cut-off values)}{ Breaks are by default \emph{exclusive}, this means that these values indicate the lower bound of the next group or interval to begin. Take a simple example, a numeric variable with values from 1 to 9. The median would be 5, thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 would turn into 2 (compare \code{cbind(1:9, categorize(1:9))}). The same variable, using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3.67 and 6.33 (see \code{quantile(1:9, probs = c(1/3, 2/3))}), which means that values from 1 to 3 belong to the first interval and are recoded into 1 (because the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. The opposite behaviour can be achieved using \code{breaks = "inclusive"}, in which case } \section{Recoding into groups with equal size or range}{ \code{split = "equal_length"} and \code{split = "equal_range"} try to divide the range of \code{x} into intervals of similar (or same) length. The difference is that \code{split = "equal_length"} will divide the range of \code{x} into \code{n_groups} pieces and thereby defining the intervals used as breaks (hence, it is equivalent to \code{cut(x, breaks = n_groups)}), while \code{split = "equal_range"} will cut \code{x} into intervals that all have the length of \code{range}, where the first interval by defaults starts at \code{1}. The lowest (or starting) value of that interval can be defined using the \code{lowest} argument. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ set.seed(123) x <- sample(1:10, size = 50, replace = TRUE) table(x) # by default, at median table(categorize(x)) # into 3 groups, based on distribution (quantiles) table(categorize(x, split = "quantile", n_groups = 3)) # into 3 groups, user-defined break table(categorize(x, split = c(3, 5))) set.seed(123) x <- sample(1:100, size = 500, replace = TRUE) # into 5 groups, try to recode into intervals of similar length, # i.e. the range within groups is the same for all groups table(categorize(x, split = "equal_length", n_groups = 5)) # into 5 groups, try to return same range within groups # i.e. 1-20, 21-40, 41-60, etc. Since the range of "x" is # 1-100, and we have a range of 20, this results into 5 # groups, and thus is for this particular case identical # to the previous result. table(categorize(x, split = "equal_range", range = 20)) # return factor with value labels instead of numeric value set.seed(123) x <- sample(1:10, size = 30, replace = TRUE) categorize(x, "equal_length", n_groups = 3) categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")) # cut numeric into groups with the mean or median as a label name x <- sample(1:10, size = 30, replace = TRUE) categorize(x, "equal_length", n_groups = 3, labels = "mean") categorize(x, "equal_length", n_groups = 3, labels = "median") # cut numeric into groups with the requested range as a label name # each category has the same range, and labels indicate this range categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") # in this example, each category has the same range, but labels only refer # to the ranges of the actual values (present in the data) inside each group categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/center.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/center.R \name{center} \alias{center} \alias{centre} \alias{center.numeric} \alias{center.data.frame} \title{Centering (Grand-Mean Centering)} \usage{ center(x, ...) centre(x, ...) \method{center}{numeric}( x, robust = FALSE, weights = NULL, reference = NULL, center = NULL, verbose = TRUE, ... ) \method{center}{data.frame}( x, select = NULL, exclude = NULL, robust = FALSE, weights = NULL, reference = NULL, center = NULL, force = FALSE, remove_na = c("none", "selected", "all"), append = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) } \arguments{ \item{x}{A (grouped) data frame, a (numeric or character) vector or a factor.} \item{...}{Currently not used.} \item{robust}{Logical, if \code{TRUE}, centering is done by subtracting the median from the variables. If \code{FALSE}, variables are centered by subtracting the mean.} \item{weights}{Can be \code{NULL} (for no weighting), or: \itemize{ \item For data frames: a numeric vector of weights, or a character of the name of a column in the \code{data.frame} that contains the weights. \item For numeric vectors: a numeric vector of weights. }} \item{reference}{A data frame or variable from which the centrality and deviation will be computed instead of from the input variable. Useful for standardizing a subset or new data according to another data frame.} \item{center}{Numeric value, which can be used as alternative to \code{reference} to define a reference centrality. If \code{center} is of length 1, it will be recycled to match the length of selected variables for centering. Else, \code{center} must be of same length as the number of selected variables. Values in \code{center} will be matched to selected variables in the provided order, unless a named vector is given. In this case, names are matched against the names of the selected variables.} \item{verbose}{Toggle warnings and messages.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{force}{Logical, if \code{TRUE}, forces centering of factors as well. Factors are converted to numerical values, with the lowest level being the value \code{1} (unless the factor has numeric levels, which are converted to the corresponding numeric value).} \item{remove_na}{How should missing values (\code{NA}) be treated: if \code{"none"} (default): each column's standardization is done separately, ignoring \code{NA}s. Else, rows with \code{NA} in the columns selected with \code{select} / \code{exclude} (\code{"selected"}) or in all columns (\code{"all"}) are dropped before standardization, and the resulting data frame does not include these cases.} \item{append}{Logical or string. If \code{TRUE}, centered variables get new column names (with the suffix \code{"_c"}) and are appended (column bind) to \code{x}, thus returning both the original and the centered variables. If \code{FALSE}, original variables in \code{x} will be overwritten by their centered versions. If a character value, centered variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ The centered variables. } \description{ Performs a grand-mean centering of data. } \note{ \strong{Difference between centering and standardizing}: Standardized variables are computed by subtracting the mean of the variable and then dividing it by the standard deviation, while centering variables involves only the subtraction. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ data(iris) # entire data frame or a vector head(iris$Sepal.Width) head(center(iris$Sepal.Width)) head(center(iris)) head(center(iris, force = TRUE)) # only the selected columns from a data frame center(anscombe, select = c("x1", "x3")) center(anscombe, exclude = c("x1", "x3")) # centering with reference center and scale d <- data.frame( a = c(-2, -1, 0, 1, 2), b = c(3, 4, 5, 6, 7) ) # default centering at mean center(d) # centering, using 0 as mean center(d, center = 0) # centering, using -5 as mean center(d, center = -5) } \seealso{ If centering within-clusters (instead of grand-mean centering) is required, see \code{\link[=demean]{demean()}}. For standardizing, see \code{\link[=standardize]{standardize()}}, and \code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas. } ================================================ FILE: man/coef_var.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/descriptives.R \name{coef_var} \alias{coef_var} \alias{distribution_cv} \alias{distribution_coef_var} \alias{coef_var.numeric} \title{Compute the coefficient of variation} \usage{ coef_var(x, ...) distribution_coef_var(x, ...) \method{coef_var}{numeric}( x, mu = NULL, sigma = NULL, method = c("standard", "unbiased", "median_mad", "qcd"), trim = 0, remove_na = FALSE, n = NULL, ... ) } \arguments{ \item{x}{A numeric vector of ratio scale (see details), or vector of values than can be coerced to one.} \item{...}{Further arguments passed to computation functions.} \item{mu}{A numeric vector of mean values to use to compute the coefficient of variation. If supplied, \code{x} is not used to compute the mean.} \item{sigma}{A numeric vector of standard deviation values to use to compute the coefficient of variation. If supplied, \code{x} is not used to compute the SD.} \item{method}{Method to use to compute the CV. Can be \code{"standard"} to compute by dividing the standard deviation by the mean, \code{"unbiased"} for the unbiased estimator for normally distributed data, or one of two robust alternatives: \code{"median_mad"} to divide the median by the \code{\link[stats:mad]{stats::mad()}}, or \code{"qcd"} (quartile coefficient of dispersion, interquartile range divided by the sum of the quartiles [twice the midhinge]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}.} \item{trim}{the fraction (0 to 0.5) of values to be trimmed from each end of \code{x} before the mean and standard deviation (or other measures) are computed. Values of \code{trim} outside the range of (0 to 0.5) are taken as the nearest endpoint.} \item{remove_na}{Logical. Should \code{NA} values be removed before computing (\code{TRUE}) or not (\code{FALSE}, default)?} \item{n}{If \code{method = "unbiased"} and both \code{mu} and \code{sigma} are provided (not computed from \code{x}), what sample size to use to adjust the computed CV for small-sample bias?} } \value{ The computed coefficient of variation for \code{x}. } \description{ Compute the coefficient of variation (CV, ratio of the standard deviation to the mean, \eqn{\sigma/\mu}) for a set of numeric values. } \details{ CV is only applicable of values taken on a ratio scale: values that have a \emph{fixed} meaningfully defined 0 (which is either the lowest or highest possible value), and that ratios between them are interpretable For example, how many sandwiches have I eaten this week? 0 means "none" and 20 sandwiches is 4 times more than 5 sandwiches. If I were to center the number of sandwiches, it will no longer be on a ratio scale (0 is no "none" it is the mean, and the ratio between 4 and -2 is not meaningful). Scaling a ratio scale still results in a ratio scale. So I can re define "how many half sandwiches did I eat this week ( = sandwiches * 0.5) and 0 would still mean "none", and 20 half-sandwiches is still 4 times more than 5 half-sandwiches. This means that CV is \strong{NOT} invariant to shifting, but it is to scaling: \if{html}{\out{
}}\preformatted{sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7) coef_var(sandwiches) #> [1] 1.239094 coef_var(sandwiches / 2) # same #> [1] 1.239094 coef_var(sandwiches + 4) # different! 0 is no longer meaningful! #> [1] 0.6290784 }\if{html}{\out{
}} } \examples{ coef_var(1:10) coef_var(c(1:10, 100), method = "median_mad") coef_var(c(1:10, 100), method = "qcd") coef_var(mu = 10, sigma = 20) coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30) } ================================================ FILE: man/coerce_to_numeric.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_numeric.R \name{coerce_to_numeric} \alias{coerce_to_numeric} \title{Convert to Numeric (if possible)} \usage{ coerce_to_numeric(x) } \arguments{ \item{x}{A vector to be converted.} } \value{ Numeric vector (if possible) } \description{ Tries to convert vector to numeric if possible (if no warnings or errors). Otherwise, leaves it as is. } \examples{ coerce_to_numeric(c("1", "2")) coerce_to_numeric(c("1", "2", "A")) } ================================================ FILE: man/colnames.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-cols.R \name{row_to_colnames} \alias{row_to_colnames} \alias{colnames_to_row} \title{Tools for working with column names} \usage{ row_to_colnames(x, row = 1, na_prefix = "x", verbose = TRUE) colnames_to_row(x, prefix = "x") } \arguments{ \item{x}{A data frame.} \item{row}{Row to use as column names.} \item{na_prefix}{Prefix to give to the column name if the row has an \code{NA}. Default is 'x', and it will be incremented at each \code{NA} (\code{x1}, \code{x2}, etc.).} \item{verbose}{Toggle warnings.} \item{prefix}{Prefix to give to the column name. Default is 'x', and it will be incremented at each column (\code{x1}, \code{x2}, etc.).} } \value{ \code{row_to_colnames()} and \code{colnames_to_row()} both return a data frame. } \description{ Tools for working with column names } \examples{ # Convert a row to column names -------------------------------- test <- data.frame( a = c("iso", 2, 5), b = c("year", 3, 6), c = c("value", 5, 7) ) test row_to_colnames(test) # Convert column names to row -------------------------------- test <- data.frame( ARG = c("BRA", "FRA"), `1960` = c(1960, 1960), `2000` = c(2000, 2000) ) test colnames_to_row(test) } ================================================ FILE: man/contr.deviation.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/contrs.R \name{contr.deviation} \alias{contr.deviation} \title{Deviation Contrast Matrix} \usage{ contr.deviation(n, base = 1, contrasts = TRUE, sparse = FALSE) } \arguments{ \item{n}{a vector of levels for a factor, or the number of levels.} \item{base}{an integer specifying which group is considered the baseline group. Ignored if \code{contrasts} is \code{FALSE}.} \item{contrasts}{a logical indicating whether contrasts should be computed.} \item{sparse}{logical indicating if the result should be sparse (of class \code{\link[Matrix:dgCMatrix-class]{dgCMatrix}}), using package \href{https://CRAN.R-project.org/package=Matrix}{\pkg{Matrix}}.} } \description{ Build a deviation contrast matrix, a type of \emph{effects contrast} matrix. } \details{ In effects coding, unlike treatment/dummy coding (\code{\link[stats:contrast]{stats::contr.treatment()}}), each contrast sums to 0. In regressions models, this results in an intercept that represents the (unweighted) average of the group means. In ANOVA settings, this also guarantees that lower order effects represent \emph{main} effects (and not \emph{simple} or \emph{conditional} effects, as is the case when using R's default \code{\link[stats:contrast]{stats::contr.treatment()}}). \cr\cr Deviation coding (\code{contr.deviation}) is a type of effects coding. With deviation coding, the coefficients for factor variables are interpreted as the difference of each factor level from the base level (this is the same interpretation as with treatment/dummy coding). For example, for a factor \code{group} with levels "A", "B", and "C", with \code{contr.devation}, the intercept represents the overall mean (average of the group means for the 3 groups), and the coefficients \code{groupB} and \code{groupC} represent the differences between the A group mean and the B and C group means, respectively. \cr\cr Sum coding (\code{\link[stats:contrast]{stats::contr.sum()}}) is another type of effects coding. With sum coding, the coefficients for factor variables are interpreted as the difference of each factor level from \strong{the grand (across-groups) mean}. For example, for a factor \code{group} with levels "A", "B", and "C", with \code{contr.sum}, the intercept represents the overall mean (average of the group means for the 3 groups), and the coefficients \code{group1} and \code{group2} represent the differences the \strong{A} and \strong{B} group means from the overall mean, respectively. } \examples{ \dontshow{if (!identical(Sys.getenv("IN_PKGDOWN"), "true")) withAutoprint(\{ # examplesIf} \donttest{ data("mtcars") mtcars <- data_modify(mtcars, cyl = factor(cyl)) c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl)) solve(c.treatment) #> 4 6 8 #> Intercept 1 0 0 # mean of the 1st level #> 6 -1 1 0 # 2nd level - 1st level #> 8 -1 0 1 # 3rd level - 1st level contrasts(mtcars$cyl) <- contr.sum c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl)) solve(c.sum) #> 4 6 8 #> Intercept 0.333 0.333 0.333 # overall mean #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean contrasts(mtcars$cyl) <- contr.deviation c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl)) solve(c.deviation) #> 4 6 8 #> Intercept 0.333 0.333 0.333 # overall mean #> 6 -1.000 1.000 0.000 # 2nd level - 1st level #> 8 -1.000 0.000 1.000 # 3rd level - 1st level ## With Interactions ----------------------------------------- mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation)) mtcars <- data_arrange(mtcars, select = c("cyl", "am")) mm <- unique(model.matrix(~ cyl * am, data = mtcars)) rownames(mm) <- c( "cyl4.am0", "cyl4.am1", "cyl6.am0", "cyl6.am1", "cyl8.am0", "cyl8.am1" ) solve(mm) #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1 #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000 #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000 } \dontshow{\}) # examplesIf} } \seealso{ \code{\link[stats:contrast]{stats::contr.sum()}} } ================================================ FILE: man/convert_na_to.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/convert_na_to.R \name{convert_na_to} \alias{convert_na_to} \alias{convert_na_to.numeric} \alias{convert_na_to.character} \alias{convert_na_to.data.frame} \title{Replace missing values in a variable or a data frame.} \usage{ convert_na_to(x, ...) \method{convert_na_to}{numeric}(x, replacement = NULL, verbose = TRUE, ...) \method{convert_na_to}{character}(x, replacement = NULL, verbose = TRUE, ...) \method{convert_na_to}{data.frame}( x, select = NULL, exclude = NULL, replacement = NULL, replace_num = replacement, replace_char = replacement, replace_fac = replacement, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A numeric, factor, or character vector, or a data frame.} \item{...}{Not used.} \item{replacement}{Numeric or character value that will be used to replace \code{NA}.} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{replace_num}{Value to replace \code{NA} when variable is of type numeric.} \item{replace_char}{Value to replace \code{NA} when variable is of type character.} \item{replace_fac}{Value to replace \code{NA} when variable is of type factor.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ \code{x}, where \code{NA} values are replaced by \code{replacement}. } \description{ Replace missing values in a variable or a data frame. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ # Convert NA to 0 in a numeric vector convert_na_to( c(9, 3, NA, 2, 3, 1, NA, 8), replacement = 0 ) # Convert NA to "missing" in a character vector convert_na_to( c("a", NA, "d", "z", NA, "t"), replacement = "missing" ) ### For data frames test_df <- data.frame( x = c(1, 2, NA), x2 = c(4, 5, NA), y = c("a", "b", NA) ) # Convert all NA to 0 in numeric variables, and all NA to "missing" in # character variables convert_na_to( test_df, replace_num = 0, replace_char = "missing" ) # Convert a specific variable in the data frame convert_na_to( test_df, replace_num = 0, replace_char = "missing", select = "x" ) # Convert all variables starting with "x" convert_na_to( test_df, replace_num = 0, replace_char = "missing", select = starts_with("x") ) # Convert NA to 1 in variable 'x2' and to 0 in all other numeric # variables convert_na_to( test_df, replace_num = 0, select = list(x2 = 1) ) } ================================================ FILE: man/convert_to_na.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/convert_to_na.R \name{convert_to_na} \alias{convert_to_na} \alias{convert_to_na.numeric} \alias{convert_to_na.factor} \alias{convert_to_na.data.frame} \title{Convert non-missing values in a variable into missing values.} \usage{ convert_to_na(x, ...) \method{convert_to_na}{numeric}(x, na = NULL, verbose = TRUE, ...) \method{convert_to_na}{factor}(x, na = NULL, drop_levels = FALSE, verbose = TRUE, ...) \method{convert_to_na}{data.frame}( x, select = NULL, exclude = NULL, na = NULL, drop_levels = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A vector, factor or a data frame.} \item{...}{Not used.} \item{na}{Numeric, character vector or logical (or a list of numeric, character vectors or logicals) with values that should be converted to \code{NA}. Numeric values applied to numeric vectors, character values are used for factors, character vectors or date variables, and logical values for logical vectors.} \item{verbose}{Toggle warnings.} \item{drop_levels}{Logical, for factors, when specific levels are replaced by \code{NA}, should unused levels be dropped?} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ \code{x}, where all values in \code{na} are converted to \code{NA}. } \description{ Convert non-missing values in a variable into missing values. } \examples{ x <- sample(1:6, size = 30, replace = TRUE) x # values 4 and 5 to NA convert_to_na(x, na = 4:5) # data frames set.seed(123) x <- data.frame( a = sample(1:6, size = 20, replace = TRUE), b = sample(letters[1:6], size = 20, replace = TRUE), c = sample(c(30:33, 99), size = 20, replace = TRUE) ) # for all numerics, convert 5 to NA. Character/factor will be ignored. convert_to_na(x, na = 5) # for numerics, 5 to NA, for character/factor, "f" to NA convert_to_na(x, na = list(6, "f")) # select specific variables convert_to_na(x, select = c("a", "b"), na = list(6, "f")) } ================================================ FILE: man/data_arrange.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_arrange.R \name{data_arrange} \alias{data_arrange} \title{Arrange rows by column values} \usage{ data_arrange(data, select = NULL, safe = TRUE) } \arguments{ \item{data}{A data frame, or an object that can be coerced to a data frame.} \item{select}{Character vector of column names. Use a dash just before column name to arrange in decreasing order, for example \code{"-x1"}.} \item{safe}{Do not throw an error if one of the variables specified doesn't exist.} } \value{ A data frame. } \description{ \code{data_arrange()} orders the rows of a data frame by the values of selected columns. } \examples{ # Arrange using several variables data_arrange(head(mtcars), c("gear", "carb")) # Arrange in decreasing order data_arrange(head(mtcars), "-carb") # Throw an error if one of the variables specified doesn't exist try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE)) } ================================================ FILE: man/data_codebook.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_codebook.R \name{data_codebook} \alias{data_codebook} \alias{print_html.data_codebook} \alias{display.data_codebook} \title{Generate a codebook of a data frame.} \usage{ data_codebook( data, select = NULL, exclude = NULL, variable_label_width = NULL, value_label_width = NULL, max_values = 10, range_at = 6, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) \method{print_html}{data_codebook}( x, font_size = "100\%", line_padding = 3, row_color = "#eeeeee", ... ) \method{display}{data_codebook}( object, format = "markdown", font_size = "100\%", line_padding = 3, row_color = "#eeeeee", ... ) } \arguments{ \item{data}{A data frame, or an object that can be coerced to a data frame.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{variable_label_width}{Length of variable labels. Longer labels will be wrapped at \code{variable_label_width} chars. If \code{NULL}, longer labels will not be split into multiple lines. Only applies to \emph{labelled data}.} \item{value_label_width}{Length of value labels. Longer labels will be shortened, where the remaining part is truncated. Only applies to \emph{labelled data} or factor levels.} \item{max_values}{Number of maximum values that should be displayed. Can be used to avoid too many rows when variables have lots of unique values.} \item{range_at}{Indicates how many unique values in a numeric vector are needed in order to print a range for that variable instead of a frequency table for all numeric values. Can be useful if the data contains numeric variables with only a few unique values and where full frequency tables instead of value ranges should be displayed.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings and messages on or off.} \item{...}{Arguments passed to or from other methods.} \item{x}{A (grouped) data frame, a vector or a statistical model (for \code{unstandardize()} cannot be a model).} \item{font_size}{For HTML tables, the font size.} \item{line_padding}{For HTML tables, the distance (in pixel) between lines.} \item{row_color}{For HTML tables, the fill color for odd rows.} \item{object}{An object returned by \code{data_tabulate()}.} \item{format}{String, indicating the output format. Can be \code{"markdown"} \code{"html"}, or \code{"tt"}. \code{format = "html"} create an HTML table using the \emph{gt} package. \code{format = "tt"} creates a \code{tinytable} object, which is either printed as markdown or HTML table, depending on the environment. See \code{\link[insight:export_table]{insight::export_table()}} for details.} } \value{ A formatted data frame, summarizing the content of the data frame. Returned columns include the column index of the variables in the original data frame (\code{ID}), column name, variable label (if data is labelled), type of variable, number of missing values, unique values (or value range), value labels (for labelled data), and a frequency table (N for each value). Most columns are formatted as character vectors. } \description{ \code{data_codebook()} generates codebooks from data frames, i.e. overviews of all variables and some more information about each variable (like labels, values or value range, frequencies, amount of missing values). } \note{ There are methods to \code{print()} the data frame in a nicer output, as well methods for printing in markdown or HTML format (\code{print_md()} and \code{print_html()}). The \code{print()} method for text outputs passes arguments in \code{...} to \code{\link[insight:export_table]{insight::export_table()}}. } \examples{ data(iris) data_codebook(iris, select = starts_with("Sepal")) data(efc) data_codebook(efc) # shorten labels data_codebook(efc, variable_label_width = 20, value_label_width = 15) # automatic range for numerics at more than 5 unique values data(mtcars) data_codebook(mtcars, select = starts_with("c")) # force all values to be displayed data_codebook(mtcars, select = starts_with("c"), range_at = 100) } ================================================ FILE: man/data_duplicated.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_duplicated.R \name{data_duplicated} \alias{data_duplicated} \title{Extract all duplicates} \usage{ data_duplicated( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) } \arguments{ \item{data}{A data frame.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A dataframe, containing all duplicates. } \description{ Extract all duplicates, for visual inspection. Note that it also contains the first occurrence of future duplicates, unlike \code{\link[=duplicated]{duplicated()}} or \code{\link[dplyr:distinct]{dplyr::distinct()}}). Also contains an additional column reporting the number of missing values for that row, to help in the decision-making when selecting which duplicates to keep. } \examples{ df1 <- data.frame( id = c(1, 2, 3, 1, 3), year = c(2022, 2022, 2022, 2022, 2000), item1 = c(NA, 1, 1, 2, 3), item2 = c(NA, 1, 1, 2, 3), item3 = c(NA, 1, 1, 2, 3) ) data_duplicated(df1, select = "id") data_duplicated(df1, select = c("id", "year")) # Filter to exclude duplicates df2 <- df1[-c(1, 5), ] df2 } \seealso{ \code{\link[=data_unique]{data_unique()}} } \keyword{duplicates} ================================================ FILE: man/data_extract.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_extract.R \name{data_extract} \alias{data_extract} \alias{data_extract.data.frame} \title{Extract one or more columns or elements from an object} \usage{ data_extract(data, select, ...) \method{data_extract}{data.frame}( data, select, name = NULL, extract = "all", as_data_frame = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{data}{The object to subset. Methods are currently available for data frames and data frame extensions (e.g., tibbles).} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{...}{For use by future methods.} \item{name}{An optional argument that specifies the column to be used as names for the vector elements after extraction. Must be specified either as literal variable name (e.g., \code{column_name}) or as string (\code{"column_name"}). \code{name} will be ignored when a data frame is returned.} \item{extract}{String, indicating which element will be extracted when \code{select} matches multiple variables. Can be \code{"all"} (the default) to return all matched variables, \code{"first"} or \code{"last"} to return the first or last match, or \code{"odd"} and \code{"even"} to return all odd-numbered or even-numbered matches. Note that \code{"first"} or \code{"last"} return a vector (unless \code{as_data_frame = TRUE}), while \code{"all"} can return a vector (if only one match was found) \emph{or} a data frame (for more than one match). Type safe return values are only possible when \code{extract} is \code{"first"} or \code{"last"} (will always return a vector) or when \code{as_data_frame = TRUE} (always returns a data frame).} \item{as_data_frame}{Logical, if \code{TRUE}, will always return a data frame, even if only one variable was matched. If \code{FALSE}, either returns a vector or a data frame. See \code{extract} for details.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A vector (or a data frame) containing the extracted element, or \code{NULL} if no matching variable was found. } \description{ \code{data_extract()} (or its alias \code{extract()}) is similar to \code{$}. It extracts either a single column or element from an object (e.g., a data frame, list), or multiple columns resp. elements. } \details{ \code{data_extract()} can be used to select multiple variables or pull a single variable from a data frame. Thus, the return value is by default not type safe - \code{data_extract()} either returns a vector or a data frame. \subsection{Extracting single variables (vectors)}{ When \code{select} is the name of a single column, or when select only matches one column, a vector is returned. A single variable is also returned when \code{extract} is either \verb{"first} or \code{"last"}. Setting \code{as_data_frame} to \code{TRUE} overrides this behaviour and \emph{always} returns a data frame. } \subsection{Extracting a data frame of variables}{ When \code{select} is a character vector containing more than one column name (or a numeric vector with more than one valid column indices), or when \code{select} uses one of the supported select-helpers that match multiple columns, a data frame is returned. Setting \code{as_data_frame} to \code{TRUE} \emph{always} returns a data frame. } } \examples{ # single variable data_extract(mtcars, cyl, name = gear) data_extract(mtcars, "cyl", name = gear) data_extract(mtcars, -1, name = gear) data_extract(mtcars, cyl, name = 0) data_extract(mtcars, cyl, name = "row.names") # selecting multiple variables head(data_extract(iris, starts_with("Sepal"))) head(data_extract(iris, ends_with("Width"))) head(data_extract(iris, 2:4)) # select first of multiple variables data_extract(iris, starts_with("Sepal"), extract = "first") # select first of multiple variables, return as data frame head(data_extract(iris, starts_with("Sepal"), extract = "first", as_data_frame = TRUE)) } ================================================ FILE: man/data_group.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_group.R \name{data_group} \alias{data_group} \alias{data_ungroup} \title{Create a grouped data frame} \usage{ data_group( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) data_ungroup(data, verbose = TRUE, ...) } \arguments{ \item{data}{A data frame} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} \item{...}{Arguments passed down to other functions. Mostly not used yet.} } \value{ A grouped data frame, i.e. a data frame with additional information about the grouping structure saved as attributes. } \description{ This function is comparable to \code{dplyr::group_by()}, but just following the \strong{datawizard} function design. \code{data_ungroup()} removes the grouping information from a grouped data frame. } \examples{ \dontshow{if (requireNamespace("poorman")) withAutoprint(\{ # examplesIf} data(efc) suppressPackageStartupMessages(library(poorman, quietly = TRUE)) # total mean efc \%>\% summarize(mean_hours = mean(c12hour, na.rm = TRUE)) # mean by educational level efc \%>\% data_group(c172code) \%>\% summarize(mean_hours = mean(c12hour, na.rm = TRUE)) \dontshow{\}) # examplesIf} } ================================================ FILE: man/data_match.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_match.R \name{data_match} \alias{data_match} \alias{data_filter} \title{Return filtered or sliced data frame, or row indices} \usage{ data_match(x, to, match = "and", return_indices = FALSE, remove_na = TRUE, ...) data_filter(x, ...) } \arguments{ \item{x}{A data frame.} \item{to}{A data frame matching the specified conditions. Note that if \code{match} is a value other than \code{"and"}, the original row order might be changed. See 'Details'.} \item{match}{String, indicating with which logical operation matching conditions should be combined. Can be \code{"and"} (or \code{"&"}), \code{"or"} (or \code{"|"}) or \code{"not"} (or \code{"!"}).} \item{return_indices}{Logical, if \code{TRUE}, return the vector of rows that can be used to filter the original data frame. If \code{FALSE} (default), returns directly the filtered data frame instead of the row indices.} \item{remove_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before filtering the data. This is the default behaviour, however, sometimes when row indices are requested (i.e. \code{return_indices=TRUE}), it might be useful to preserve \code{NA} values, so returned row indices match the row indices of the original data frame.} \item{...}{A sequence of logical expressions indicating which rows to keep, or a numeric vector indicating the row indices of rows to keep. Can also be a string representation of a logical expression (e.g. \code{"x > 4"}), a character vector (e.g. \code{c("x > 4", "y == 2")}) or a variable that contains the string representation of a logical expression. These might be useful when used in packages to avoid defining undefined global variables.} } \value{ A filtered data frame, or the row indices that match the specified configuration. } \description{ Return a filtered (or sliced) data frame or row indices of a data frame that match a specific condition. \code{data_filter()} works like \code{data_match()}, but works with logical expressions or row indices of a data frame to specify matching conditions. } \details{ For \code{data_match()}, if \code{match} is either \code{"or"} or \code{"not"}, the original row order from \code{x} might be changed. If preserving row order is required, use \code{data_filter()} instead. \if{html}{\out{
}}\preformatted{# mimics subset() behaviour, preserving original row order head(data_filter(mtcars[c("mpg", "vs", "am")], vs == 0 | am == 1)) #> mpg vs am #> Mazda RX4 21.0 0 1 #> Mazda RX4 Wag 21.0 0 1 #> Datsun 710 22.8 1 1 #> Hornet Sportabout 18.7 0 0 #> Duster 360 14.3 0 0 #> Merc 450SE 16.4 0 0 # re-sorting rows head(data_match(mtcars[c("mpg", "vs", "am")], data.frame(vs = 0, am = 1), match = "or")) #> mpg vs am #> Mazda RX4 21.0 0 1 #> Mazda RX4 Wag 21.0 0 1 #> Hornet Sportabout 18.7 0 0 #> Duster 360 14.3 0 0 #> Merc 450SE 16.4 0 0 #> Merc 450SL 17.3 0 0 }\if{html}{\out{
}} While \code{data_match()} works with data frames to match conditions against, \code{data_filter()} is basically a wrapper around \verb{subset(subset = )}. However, unlike \code{subset()}, it preserves label attributes and is useful when working with labelled data. } \examples{ data_match(mtcars, data.frame(vs = 0, am = 1)) data_match(mtcars, data.frame(vs = 0, am = c(0, 1))) # observations where "vs" is NOT 0 AND "am" is NOT 1 data_match(mtcars, data.frame(vs = 0, am = 1), match = "not") # equivalent to data_filter(mtcars, vs != 0 & am != 1) # observations where EITHER "vs" is 0 OR "am" is 1 data_match(mtcars, data.frame(vs = 0, am = 1), match = "or") # equivalent to data_filter(mtcars, vs == 0 | am == 1) # slice data frame by row indices data_filter(mtcars, 5:10) # Define a custom function containing data_filter() my_filter <- function(data, variable) { data_filter(data, variable) } my_filter(mtcars, "cyl == 6") # Pass complete filter-condition as string. my_filter <- function(data, condition) { data_filter(data, condition) } my_filter(mtcars, "am != 0") # string can also be used directly as argument data_filter(mtcars, "am != 0") # or as variable fl <- "am != 0" data_filter(mtcars, fl) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_merge.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_merge.R \name{data_merge} \alias{data_merge} \alias{data_join} \alias{data_merge.data.frame} \alias{data_merge.list} \title{Merge (join) two data frames, or a list of data frames} \usage{ data_merge(x, ...) data_join(x, ...) \method{data_merge}{data.frame}(x, y, join = "left", by = NULL, id = NULL, verbose = TRUE, ...) \method{data_merge}{list}(x, join = "left", by = NULL, id = NULL, verbose = TRUE, ...) } \arguments{ \item{x, y}{A data frame to merge. \code{x} may also be a list of data frames that will be merged. Note that the list-method has no \code{y} argument.} \item{...}{Not used.} \item{join}{Character vector, indicating the method of joining the data frames. Can be \code{"full"}, \code{"left"} (default), \code{"right"}, \code{"inner"}, \code{"anti"}, \code{"semi"} or \code{"bind"}. See details below.} \item{by}{Specifications of the columns used for merging.} \item{id}{Optional name for ID column that will be created to indicate the source data frames for appended rows. Only applies if \code{join = "bind"}.} \item{verbose}{Toggle warnings.} } \value{ A merged data frame. } \description{ Merge (join) two data frames, or a list of data frames. However, unlike base R's \code{merge()}, \code{data_merge()} offers a few more methods to join data frames, and it does not drop data frame nor column attributes. } \section{Merging data frames}{ Merging data frames is performed by adding rows (cases), columns (variables) or both from the source data frame (\code{y}) to the target data frame (\code{x}). This usually requires one or more variables which are included in both data frames and that are used for merging, typically indicated with the \code{by} argument. When \code{by} contains a variable present in both data frames, cases are matched and filtered by identical values of \code{by} in \code{x} and \code{y}. } \section{Left- and right-joins}{ Left- and right joins usually don't add new rows (cases), but only new columns (variables) for existing cases in \code{x}. For \code{join = "left"} or \code{join = "right"} to work, \code{by} \emph{must} indicate one or more columns that are included in both data frames. For \code{join = "left"}, if \code{by} is an identifier variable, which is included in both \code{x} and \code{y}, all variables from \code{y} are copied to \code{x}, but only those cases from \code{y} that have matching values in their identifier variable in \code{x} (i.e. all cases in \code{x} that are also found in \code{y} get the related values from the new columns in \code{y}). If there is no match between identifiers in \code{x} and \code{y}, the copied variable from \code{y} will get a \code{NA} value for this particular case. Other variables that occur both in \code{x} and \code{y}, but are not used as identifiers (with \code{by}), will be renamed to avoid multiple identical variable names. Cases in \code{y} where values from the identifier have no match in \code{x}'s identifier are removed. \code{join = "right"} works in a similar way as \code{join = "left"}, just that only cases from \code{x} that have matching values in their identifier variable in \code{y} are chosen. In base R, these are equivalent to \code{merge(x, y, all.x = TRUE)} and \code{merge(x, y, all.y = TRUE)}. } \section{Full joins}{ Full joins copy all cases from \code{y} to \code{x}. For matching cases in both data frames, values for new variables are copied from \code{y} to \code{x}. For cases in \code{y} not present in \code{x}, these will be added as new rows to \code{x}. Thus, full joins not only add new columns (variables), but also might add new rows (cases). In base R, this is equivalent to \code{merge(x, y, all = TRUE)}. } \section{Inner joins}{ Inner joins merge two data frames, however, only those rows (cases) are kept that are present in both data frames. Thus, inner joins usually add new columns (variables), but also remove rows (cases) that only occur in one data frame. In base R, this is equivalent to \code{merge(x, y)}. } \section{Binds}{ \code{join = "bind"} row-binds the complete second data frame \code{y} to \code{x}. Unlike simple \code{rbind()}, which requires the same columns for both data frames, \code{join = "bind"} will bind shared columns from \code{y} to \code{x}, and add new columns from \code{y} to \code{x}. } \examples{ x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3) y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4) x y # "by" will default to all shared columns, i.e. "c" and "id". new columns # "d" and "e" will be copied from "y" to "x", but there are only two cases # in "x" that have the same values for "c" and "id" in "y". only those cases # have values in the copied columns, the other case gets "NA". data_merge(x, y, join = "left") # we change the id-value here x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3) y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 3:5) x y # no cases in "y" have the same matching "c" and "id" as in "x", thus # copied variables from "y" to "x" copy no values, all get NA. data_merge(x, y, join = "left") # one case in "y" has a match in "id" with "x", thus values for this # case from the remaining variables in "y" are copied to "x", all other # values (cases) in those remaining variables get NA data_merge(x, y, join = "left", by = "id") data(mtcars) x <- mtcars[1:5, 1:3] y <- mtcars[28:32, 4:6] # add ID common column x$id <- 1:5 y$id <- 3:7 # left-join, add new variables and copy values from y to x, # where "id" values match data_merge(x, y) # right-join, add new variables and copy values from x to y, # where "id" values match data_merge(x, y, join = "right") # full-join data_merge(x, y, join = "full") data(mtcars) x <- mtcars[1:5, 1:3] y <- mtcars[28:32, c(1, 4:5)] # add ID common column x$id <- 1:5 y$id <- 3:7 # left-join, no matching rows (because columns "id" and "disp" are used) # new variables get all NA values data_merge(x, y) # one common value in "mpg", so one row from y is copied to x data_merge(x, y, by = "mpg") # only keep rows with matching values in by-column data_merge(x, y, join = "semi", by = "mpg") # only keep rows with non-matching values in by-column data_merge(x, y, join = "anti", by = "mpg") # merge list of data frames. can be of different rows x <- mtcars[1:5, 1:3] y <- mtcars[28:31, 3:5] z <- mtcars[11:18, c(1, 3:4, 6:8)] x$id <- 1:5 y$id <- 4:7 z$id <- 3:10 data_merge(list(x, y, z), join = "bind", by = "id", id = "source") } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_modify.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_modify.R \name{data_modify} \alias{data_modify} \alias{data_modify.data.frame} \title{Create new variables in a data frame} \usage{ data_modify(data, ...) \method{data_modify}{data.frame}(data, ..., .if = NULL, .at = NULL, .modify = NULL) } \arguments{ \item{data}{A data frame} \item{...}{One or more expressions that define the new variable name and the values or recoding of those new variables. These expressions can be one of: \itemize{ \item A sequence of named, literal expressions, where the left-hand side refers to the name of the new variable, while the right-hand side represent the values of the new variable. Example: \code{Sepal.Width = center(Sepal.Width)}. \item A vector of length 1 (which will be recycled to match the number of rows in the data), or of same length as the data. \item A variable that contains a value to be used. Example: \if{html}{\out{
}}\preformatted{a <- "abc" data_modify(iris, var_abc = a) # var_abc contains "abc" }\if{html}{\out{
}} \item An expression can also be provided as string and wrapped in \code{as_expr()}. Example: \if{html}{\out{
}}\preformatted{data_modify(iris, as_expr("Sepal.Width = center(Sepal.Width)")) # or a <- "center(Sepal.Width)" data_modify(iris, Sepal.Width = as_expr(a)) # or a <- "Sepal.Width = center(Sepal.Width)" data_modify(iris, as_expr(a)) }\if{html}{\out{
}} Note that \code{as_expr()} is no real function, which cannot be used outside of \code{data_modify()}, and hence it is not exported nor documented. Rather, it is only used for internally processing expressions. \item Using \code{NULL} as right-hand side removes a variable from the data frame. Example: \code{Petal.Width = NULL}. \item For data frames (including grouped ones), the function \code{n()} can be used to count the number of observations and thereby, for instance, create index values by using \code{id = 1:n()} or \code{id = 3:(n()+2)} and similar. Note that, like \code{as_expr()}, \code{n()} is also no true function and cannot be used outside of \code{data_modify()}. } Note that newly created variables can be used in subsequent expressions, including \code{.at} or \code{.if}. See also 'Examples'.} \item{.if}{A function that returns \code{TRUE} for columns in the data frame where \code{.if} applies. This argument is used in combination with the \code{.modify} argument. Note that only one of \code{.at} or \code{.if} can be provided, but not both at the same time. Newly created variables in \code{...} can also be selected, see 'Examples'.} \item{.at}{A character vector of variable names that should be modified. This argument is used in combination with the \code{.modify} argument. Note that only one of \code{.at} or \code{.if} can be provided, but not both at the same time. Newly created variables in \code{...} can also be selected, see 'Examples'.} \item{.modify}{A function that modifies the variables defined in \code{.at} or \code{.if}. This argument is used in combination with either the \code{.at} or the \code{.if} argument. Note that the modified variable (i.e. the result from \code{.modify}) must be either of length 1 or of same length as the input variable.} } \description{ Create new variables or modify existing variables in a data frame. Unlike \code{base::transform()}, \code{data_modify()} can be used on grouped data frames, and newly created variables can be directly used. } \note{ \code{data_modify()} can also be used inside functions. However, it is recommended to pass the recode-expression as character vector or list of characters. } \examples{ data(efc) new_efc <- data_modify( efc, c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), c12hour_z2 = standardize(c12hour) ) head(new_efc) # using strings instead of literal expressions new_efc <- data_modify( efc, as_expr("c12hour_c = center(c12hour)"), as_expr("c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"), as_expr("c12hour_z2 = standardize(c12hour)") ) head(new_efc) # using a character vector, provided a variable xpr <- c( "c12hour_c = center(c12hour)", "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)", "c12hour_z2 = standardize(c12hour)" ) new_efc <- data_modify(efc, as_expr(xpr)) head(new_efc) # using character strings, provided as variable stand <- "c12hour_c / sd(c12hour, na.rm = TRUE)" new_efc <- data_modify( efc, c12hour_c = center(c12hour), c12hour_z = as_expr(stand) ) head(new_efc) # attributes - in this case, value and variable labels - are preserved str(new_efc) # using `paste()` to build a string-expression to_standardize <- c("Petal.Length", "Sepal.Length") out <- data_modify( iris, as_expr( paste0(to_standardize, "_stand = standardize(", to_standardize, ")") ) ) head(out) # overwrite existing variable, remove old variable out <- data_modify(iris, Petal.Length = 1 / Sepal.Length, Sepal.Length = NULL) head(out) # works on grouped data grouped_efc <- data_group(efc, "c172code") new_efc <- data_modify( grouped_efc, c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), c12hour_z2 = standardize(c12hour), id = 1:n() ) head(new_efc) # works from inside functions foo1 <- function(data, ...) { head(data_modify(data, ...)) } foo1(iris, SW_fraction = Sepal.Width / 10) # or foo1(iris, as_expr("SW_fraction = Sepal.Width / 10")) # also with string arguments, using `as_expr()` foo2 <- function(data, modification) { head(data_modify(data, as_expr(modification))) } foo2(iris, "SW_fraction = Sepal.Width / 10") # modify at specific positions or if condition is met d <- iris[1:5, ] data_modify(d, .at = "Species", .modify = as.numeric) data_modify(d, .if = is.factor, .modify = as.numeric) # can be combined with dots data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric) # new variables used in `.at` or `.if` data_modify( d, new_length = Petal.Length * 2, .at = c("Petal.Length", "new_length"), .modify = round ) # combine "extract_column_names()" and ".at" argument out <- data_modify( d, .at = extract_column_names(d, select = starts_with("Sepal")), .modify = as.factor ) # "Sepal.Length" and "Sepal.Width" are now factors str(out) } ================================================ FILE: man/data_partition.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_partition.R \name{data_partition} \alias{data_partition} \title{Partition data} \usage{ data_partition( data, proportion = 0.7, by = NULL, seed = NULL, row_id = ".row_id", verbose = TRUE, ... ) } \arguments{ \item{data}{A data frame.} \item{proportion}{Scalar (between 0 and 1) or numeric vector, indicating the proportion(s) of the training set(s). The sum of \code{proportion} must not be greater than 1. The remaining part will be used for the test set.} \item{by}{A character vector indicating the name(s) of the column(s) used for stratified partitioning.} \item{seed}{A random number generator seed. Enter an integer (e.g. 123) so that the random sampling will be the same each time you run the function.} \item{row_id}{Character string, indicating the name of the column that contains the row-id's.} \item{verbose}{Toggle messages and warnings.} \item{...}{Other arguments passed to or from other functions.} } \value{ A list of data frames. The list includes one training set per given proportion and the remaining data as test set. List elements of training sets are named after the given proportions (e.g., \verb{$p_0.7}), the test set is named \verb{$test}. } \description{ Creates data partitions (for instance, a training and a test set) based on a data frame that can also be stratified (i.e., evenly spread a given factor) using the \code{by} argument. } \examples{ data(iris) out <- data_partition(iris, proportion = 0.9) out$test nrow(out$p_0.9) # Stratify by group (equal proportions of each species) out <- data_partition(iris, proportion = 0.9, by = "Species") out$test # Create multiple partitions out <- data_partition(iris, proportion = c(0.3, 0.3)) lapply(out, head) # Create multiple partitions, stratified by group - 30\% equally sampled # from species in first training set, 50\% in second training set and # remaining 20\% equally sampled from each species in test set. out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species") lapply(out, function(i) table(i$Species)) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_peek.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_peek.R \name{data_peek} \alias{data_peek} \alias{data_peek.data.frame} \title{Peek at values and type of variables in a data frame} \usage{ data_peek(x, ...) \method{data_peek}{data.frame}( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, width = NULL, verbose = TRUE, ... ) } \arguments{ \item{x}{A data frame.} \item{...}{not used.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{width}{Maximum width of line length to display. If \code{NULL}, width will be determined using \code{options()$width}.} \item{verbose}{Toggle warnings.} } \value{ A data frame with three columns, containing information about the name, type and first values of the input data frame. } \description{ This function creates a table a data frame, showing all column names, variable types and the first values (as many as fit into the screen). } \note{ To show only specific or a limited number of variables, use the \code{select} argument, e.g. \code{select = 1:5} to show only the first five variables. } \examples{ data(efc) data_peek(efc) # show variables two to four data_peek(efc, select = 2:4) } ================================================ FILE: man/data_prefix_suffix.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_addprefix.R \name{data_addprefix} \alias{data_addprefix} \alias{data_addsuffix} \title{Add a prefix or suffix to column names} \usage{ data_addprefix( data, pattern, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) data_addsuffix( data, pattern, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{data}{A data frame.} \item{pattern}{A character string, which will be added as prefix or suffix to the column names.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} \item{...}{Other arguments passed to or from other functions.} } \description{ Add a prefix or suffix to column names } \examples{ # Add prefix / suffix to all columns head(data_addprefix(iris, "NEW_")) head(data_addsuffix(iris, "_OLD")) } \seealso{ \code{\link[=data_rename]{data_rename()}} for more fine-grained column renaming. } ================================================ FILE: man/data_read.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_read.R, R/data_write.R \name{data_read} \alias{data_read} \alias{data_write} \title{Read (import) data files from various sources} \usage{ data_read( path, path_catalog = NULL, encoding = NULL, convert_factors = TRUE, password = NULL, verbose = TRUE, ... ) data_write( data, path, delimiter = ",", convert_factors = FALSE, save_labels = FALSE, password = NULL, verbose = TRUE, ... ) } \arguments{ \item{path}{Character string, the file path to the data file.} \item{path_catalog}{Character string, path to the catalog file. Only relevant for SAS data files.} \item{encoding}{The character encoding used for the file. Usually not needed.} \item{convert_factors}{If \code{TRUE} (default), numeric variables, where all values have a value label, are assumed to be categorical and converted into factors. If \code{FALSE}, no variable types are guessed and no conversion of numeric variables into factors will be performed. For \code{data_read()}, this argument only applies to file types with \emph{labelled data}, e.g. files from SPSS, SAS or Stata. See also section 'Differences to other packages'. For \code{data_write()}, this argument only applies to the text (e.g. \code{.txt} or \code{.csv}) or spreadsheet file formats (like \code{.xlsx}). Converting to factors might be useful for these formats because labelled numeric variables are then converted into factors and exported as character columns - else, value labels would be lost and only numeric values are written to the file.} \item{password}{Password for data encryption. If not \code{NULL}, the data will be encrypted (for \code{data_write()}) or decrypted (for \code{data_read()}) using the provided password. Encryption is currently only supported for R file formats (\code{.rds}, \code{.rda} and \code{.rdata}). See the section "Data encryption" below for more information on the encryption method used.} \item{verbose}{Toggle warnings and messages.} \item{...}{Arguments passed to the related \verb{read_*()} or \verb{write_*()} functions.} \item{data}{The data frame that should be written to a file.} \item{delimiter}{For CSV-files, specifies the delimiter. Defaults to \code{","}, but in particular in European regions, \code{";"} might be a useful alternative, especially when exported CSV-files should be opened in Excel.} \item{save_labels}{Only applies to CSV files. If \code{TRUE}, value and variable labels (if any) will be saved as additional CSV file. This file has the same file name as the exported CSV file, but includes a \code{"_labels"} suffix (i.e. when the file name is \code{"mydat.csv"}, the additional file with value and variable labels is named \code{"mydat_labels.csv"}).} } \value{ A data frame. } \description{ This functions imports data from various file types. It is a small wrapper around \code{haven::read_spss()}, \code{haven::read_stata()}, \code{haven::read_sas()}, \code{readxl::read_excel()} and \code{data.table::fread()} resp. \code{readr::read_delim()} (the latter if package \strong{data.table} is not installed). Thus, supported file types for importing data are data files from SPSS, SAS or Stata, Excel files or text files (like '.csv' files). All other file types are passed to \code{rio::import()}. \code{data_write()} works in a similar way. } \section{Supported file types}{ \itemize{ \item \code{data_read()} is a wrapper around the \strong{haven}, \strong{data.table}, \strong{readr} \strong{readxl}, \strong{nanoparquet} and \strong{rio} packages. Currently supported file types are \code{.txt}, \code{.csv}, \code{.xls}, \code{.xlsx}, \code{.sav}, \code{.por}, \code{.dta}, \code{.sas}, \code{.rda}, \code{.parquet}, \code{.rdata}, and \code{.rds} (and related files). All other file types are passed to \code{rio::import()}. \item \code{data_write()} is a wrapper around \strong{haven}, \strong{readr}, \strong{nanoparquet}, and \strong{rio} packages, and supports writing files into all formats supported by these packages. } } \section{Compressed files (zip) and URLs}{ \code{data_read()} can also read the above mentioned files from URLs or from inside zip-compressed files. Thus, \code{path} can also be a URL to a file like \code{"http://www.url.com/file.csv"}. When \code{path} points to a zip-compressed file, and there are multiple files inside the zip-archive, then the first supported file is extracted and loaded. } \section{General behaviour}{ \code{data_read()} detects the appropriate \verb{read_*()} function based on the file-extension of the data file. Thus, in most cases it should be enough to only specify the \code{path} argument. However, if more control is needed, all arguments in \code{...} are passed down to the related \verb{read_*()} function. The same applies to \code{data_write()}, i.e. based on the file extension provided in \code{path}, the appropriate \verb{write_*()} function is used automatically. } \section{SPSS specific behaviour}{ \code{data_read()} does \emph{not} import user-defined ("tagged") \code{NA} values from SPSS, i.e. argument \code{user_na} is always set to \code{FALSE} when importing SPSS data with the \strong{haven} package. Use \code{convert_to_na()} to define missing values in the imported data, if necessary. Furthermore, \code{data_write()} compresses SPSS files by default. If this causes problems with (older) SPSS versions, use \code{compress = "none"}, for example \code{data_write(data, "myfile.sav", compress = "none")}. } \section{Differences to other packages that read foreign data formats}{ \code{data_read()} is most comparable to \code{rio::import()}. For data files from SPSS, SAS or Stata, which support labelled data, variables are converted into their most appropriate type. The major difference to \code{rio::import()} is for data files from SPSS, SAS, or Stata, i.e. file types that support \emph{labelled data}. \code{data_read()} automatically converts fully labelled numeric variables into factors, where imported value labels will be set as factor levels. If a numeric variable has \emph{no} value labels or less value labels than values, it is not converted to factor. In this case, value labels are preserved as \code{"labels"} attribute. Character vectors are preserved. Use \code{convert_factors = FALSE} to remove the automatic conversion of numeric variables to factors. } \section{Data encryption}{ \code{data_read()} and \code{data_write()} support data encryption for R file formats (\code{.rds}, \code{.rda} and \code{.rdata}). To encrypt a file, provide a password to the \code{password} argument in \code{data_write()}. To decrypt the file, provide the same password to \code{data_read()}. The encryption is based on the \strong{openssl} package and uses the AES-GCM algorithm (see \code{?openssl::aes_gcm_encrypt}) with a 256-bit key (see \code{?openssl::sha256}). Thus, data can also be decrypted without relying on the \strong{datawizard} package, e.g. using following code: \if{html}{\out{
}}\preformatted{encrypted_data <- readRDS(datafile) key <- openssl::sha256(charToRaw("")) out <- openssl::aes_gcm_decrypt(encrypted_data, key = key) decrypted_data <- unserialize(out) }\if{html}{\out{
}} \strong{Warning:} Do not lose your \code{password}, else you will not be able to decrypt the data again! } ================================================ FILE: man/data_relocate.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_relocate.R, R/data_remove.R \name{data_relocate} \alias{data_relocate} \alias{data_reorder} \alias{data_remove} \title{Relocate (reorder) columns of a data frame} \usage{ data_relocate( data, select, before = NULL, after = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) data_reorder( data, select, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) data_remove( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) } \arguments{ \item{data}{A data frame.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{before, after}{Destination of columns. Supplying neither will move columns to the left-hand side; specifying both is an error. Can be a character vector, indicating the name of the destination column, or a numeric value, indicating the index number of the destination column. If \code{-1}, will be added before or after the last column.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} \item{...}{Arguments passed down to other functions. Mostly not used yet.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} } \value{ A data frame with reordered columns. } \description{ \code{data_relocate()} will reorder columns to specific positions, indicated by \code{before} or \code{after}. \code{data_reorder()} will instead move selected columns to the beginning of a data frame. Finally, \code{data_remove()} removes columns from a data frame. All functions support select-helpers that allow flexible specification of a search pattern to find matching columns, which should be reordered or removed. } \examples{ # Reorder columns head(data_relocate(iris, select = "Species", before = "Sepal.Length")) head(data_relocate(iris, select = "Species", before = "Sepal.Width")) head(data_relocate(iris, select = "Sepal.Width", after = "Species")) # which is same as head(data_relocate(iris, select = "Sepal.Width", after = -1)) # Reorder multiple columns head(data_relocate(iris, select = c("Species", "Petal.Length"), after = "Sepal.Width")) # which is same as head(data_relocate(iris, select = c("Species", "Petal.Length"), after = 2)) # Reorder columns head(data_reorder(iris, c("Species", "Sepal.Length"))) # Remove columns head(data_remove(iris, "Sepal.Length")) head(data_remove(iris, starts_with("Sepal"))) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_rename.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_rename.R \name{data_rename} \alias{data_rename} \alias{data_rename_rows} \title{Rename columns and variable names} \usage{ data_rename(data, select = NULL, replacement = NULL, ...) data_rename_rows(data, rows = NULL) } \arguments{ \item{data}{A data frame.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{replacement}{Character vector. Can be one of the following: \itemize{ \item A character vector that indicates the new names of the columns selected in \code{select}. \code{select} and \code{replacement} must be of the same length. \item A string (i.e. character vector of length 1) with a "glue" styled pattern. Currently supported tokens are: \itemize{ \item \code{{col}} which will be replaced by the column name, i.e. the corresponding value in \code{select}. \item \code{{n}} will be replaced by the number of the variable that is replaced. \item \code{{letter}} will be replaced by alphabetical letters in sequential order. If more than 26 letters are required, letters are repeated, but have sequential numeric indices (e.g., \code{a1} to \code{z1}, followed by \code{a2} to \code{z2}). \item Finally, the name of a user-defined object that is available in the environment can be used. Note that the object's name is not allowed to be one of the pre-defined tokens, \code{"col"}, \code{"n"} and \code{"letter"}. } An example for the use of tokens is... \if{html}{\out{
}}\preformatted{data_rename( mtcars, select = c("am", "vs"), replacement = "new_name_from_\{col\}" ) }\if{html}{\out{
}} ... which would return new column names \code{new_name_from_am} and \code{new_name_from_vs}. See 'Examples'. } If \code{select} is a named vector, \code{replacement} is ignored.} \item{...}{Other arguments passed to or from other functions.} \item{rows}{Vector of row names.} } \value{ A modified data frame. } \description{ Safe and intuitive functions to rename variables or rows in data frames. \code{data_rename()} will rename column names, i.e. it facilitates renaming variables. \code{data_rename_rows()} is a convenient shortcut to add or rename row names of a data frame, but unlike \code{row.names()}, its input and output is a data frame, thus, integrating smoothly into a possible pipe-workflow. } \details{ \code{select} can also be a named character vector. In this case, the names are used to rename the columns in the output data frame. If you have a named list, use \code{unlist()} to convert it to a named vector. See 'Examples'. } \examples{ # Rename columns head(data_rename(iris, "Sepal.Length", "length")) # Use named vector to rename head(data_rename(iris, c(length = "Sepal.Length", width = "Sepal.Width"))) # Change all head(data_rename(iris, replacement = paste0("Var", 1:5))) # Use glue-styled patterns head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}")) head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "{col}_is_column_{n}")) head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}")) # User-defined glue-styled patterns from objects in environment x <- c("hi", "there", "!") head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}")) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_replicate.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_replicate.R \name{data_replicate} \alias{data_replicate} \title{Expand (i.e. replicate rows) a data frame} \usage{ data_replicate( data, expand = NULL, select = NULL, exclude = NULL, remove_na = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) } \arguments{ \item{data}{A data frame.} \item{expand}{The name of the column that contains the counts of replications for each row. Can also be a numeric value, indicating the position of that column. Note that the variable indicated by \code{expand} must be an integer vector.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{remove_na}{Logical. If \code{TRUE}, missing values in the column provided in \code{expand} are removed from the data frame. If \code{FALSE} and \code{expand} contains missing values, the function will throw an error.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{verbose}{Toggle warnings.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{...}{Currently not used.} } \value{ A dataframe with each row replicated as many times as defined in \code{expand}. } \description{ Expand a data frame by replicating rows based on another variable that contains the counts of replications per row. } \examples{ data(mtcars) data_replicate(head(mtcars), "carb") } ================================================ FILE: man/data_restoretype.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_restoretype.R \name{data_restoretype} \alias{data_restoretype} \title{Restore the type of columns according to a reference data frame} \usage{ data_restoretype(data, reference = NULL, ...) } \arguments{ \item{data}{A data frame for which to restore the column types.} \item{reference}{A reference data frame from which to find the correct column types. If \code{NULL}, each column is converted to numeric if it doesn't generate \code{NA}s. For example, \code{c("1", "2")} can be converted to numeric but not \code{c("Sepal.Length")}.} \item{...}{Currently not used.} } \value{ A data frame with columns whose types have been restored based on the reference data frame. } \description{ Restore the type of columns according to a reference data frame } \examples{ data <- data.frame( Sepal.Length = c("1", "3", "2"), Species = c("setosa", "versicolor", "setosa"), New = c("1", "3", "4") ) fixed <- data_restoretype(data, reference = iris) summary(fixed) } ================================================ FILE: man/data_rotate.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_rotate.R \name{data_rotate} \alias{data_rotate} \alias{data_transpose} \title{Rotate a data frame} \usage{ data_rotate(data, rownames = NULL, colnames = FALSE, verbose = TRUE) data_transpose(data, rownames = NULL, colnames = FALSE, verbose = TRUE) } \arguments{ \item{data}{A data frame.} \item{rownames}{Character vector (optional). If not \code{NULL}, the data frame's rownames will be added as (first) column to the output, with \code{rownames} being the name of this column.} \item{colnames}{Logical or character vector (optional). If \code{TRUE}, the values of the first column in \code{x} will be used as column names in the rotated data frame. If a character vector, values from that column are used as column names.} \item{verbose}{Toggle warnings.} } \value{ A (rotated) data frame. } \description{ This function rotates a data frame, i.e. columns become rows and vice versa. It's the equivalent of using \code{t()} but restores the \code{data.frame} class, preserves attributes and prints a warning if the data type is modified (see example). } \examples{ x <- mtcars[1:3, 1:4] x data_rotate(x) data_rotate(x, rownames = "property") # use values in 1. column as column name data_rotate(x, colnames = TRUE) data_rotate(x, rownames = "property", colnames = TRUE) # use either first column or specific column for column names x <- data.frame(a = 1:5, b = 11:15, c = 21:25) data_rotate(x, colnames = TRUE) data_rotate(x, colnames = "c") } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_seek.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_seek.R \name{data_seek} \alias{data_seek} \title{Find variables by their names, variable or value labels} \usage{ data_seek(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) } \arguments{ \item{data}{A data frame.} \item{pattern}{Character string (regular expression) to be matched in \code{data}. May also be a character vector of length > 1. \code{pattern} is searched for in column names, variable label and value labels attributes, or factor levels of variables in \code{data}.} \item{seek}{Character vector, indicating where \code{pattern} is sought. Use one or more of the following options: \itemize{ \item \code{"names"}: Searches in column names. \code{"column_names"} and \code{"columns"} are aliases for \code{"names"}. \item \code{"labels"}: Searches in variable labels. Only applies when a \code{label} attribute is set for a variable. \item \code{"values"}: Searches in value labels or factor levels. Only applies when a \code{labels} attribute is set for a variable, or if a variable is a factor. \code{"levels"} is an alias for \code{"values"}. \item \code{"all"}: Searches in all of the above. }} \item{fuzzy}{Logical. If \code{TRUE}, "fuzzy matching" (partial and close distance matching) will be used to find \code{pattern}.} } \value{ A data frame with three columns: the column index, the column name and - if available - the variable label of all matched variables in \code{data}. } \description{ This functions seeks variables in a data frame, based on patterns that either match the variable name (column name), variable labels, value labels or factor levels. Matching variable and value labels only works for "labelled" data, i.e. when the variables either have a \code{label} attribute or \code{labels} attribute. \code{data_seek()} is particular useful for larger data frames with labelled data - finding the correct variable name can be a challenge. This function helps to find the required variables, when only certain patterns of variable names or labels are known. } \examples{ # seek variables with "Length" in variable name or labels data_seek(iris, "Length") # seek variables with "dependency" in names or labels # column "e42dep" has a label-attribute "elder's dependency" data(efc) data_seek(efc, "dependency") # "female" only appears as value label attribute - default search is in # variable names and labels only, so no match data_seek(efc, "female") # when we seek in all sources, we find the variable "e16sex" data_seek(efc, "female", seek = "all") # typo, no match data_seek(iris, "Lenght") # typo, fuzzy match data_seek(iris, "Lenght", fuzzy = TRUE) } ================================================ FILE: man/data_separate.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_separate.R \name{data_separate} \alias{data_separate} \title{Separate single variable into multiple variables} \usage{ data_separate( data, select = NULL, new_columns = NULL, separator = "[^[:alnum:]]+", guess_columns = NULL, merge_multiple = FALSE, merge_separator = "", fill = "right", extra = "drop_right", convert_na = TRUE, exclude = NULL, append = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) } \arguments{ \item{data}{A data frame.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{new_columns}{The names of the new columns, as character vector. If more than one variable was selected (in \code{select}), the new names are prefixed with the name of the original column. \code{new_columns} can also be a list of (named) character vectors when multiple variables should be separated. See 'Examples'.} \item{separator}{Separator between columns. Can be a character vector, which is then treated as regular expression, or a numeric vector that indicates at which positions the string values will be split.} \item{guess_columns}{If \code{new_columns} is not given, the required number of new columns is guessed based on the results of value splitting. For example, if a variable is split into three new columns, this will be considered as the required number of new columns, and columns are named \code{"split_1"}, \code{"split_2"} and \code{"split_3"}. When values from a variable are split into different amount of new columns, the \code{guess_column} can be either \code{"mode"} (number of new columns is based on the most common number of splits), \code{"min"} or \code{"max"} to use the minimum resp. maximum number of possible splits as required number of columns.} \item{merge_multiple}{Logical, if \code{TRUE} and more than one variable is selected for separating, new columns can be merged. Value pairs of all split variables are merged.} \item{merge_separator}{Separator string when \code{merge_multiple = TRUE}. Defines the string that is used to merge values together.} \item{fill}{How to deal with values that return fewer new columns after splitting? Can be \code{"left"} (fill missing columns from the left with \code{NA}), \code{"right"} (fill missing columns from the right with \code{NA}) or \code{"value_left"} or \code{"value_right"} to fill missing columns from left or right with the left-most or right-most values.} \item{extra}{How to deal with values that return too many new columns after splitting? Can be \code{"drop_left"} or \code{"drop_right"} to drop the left-most or right-most values, or \code{"merge_left"} or \code{"merge_right"} to merge the left- or right-most value together, and keeping all remaining values as is.} \item{convert_na}{Logical, if \code{TRUE}, character \code{"NA"} values are converted into real \code{NA} values.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{append}{Logical, if \code{FALSE} (default), removes original columns that were separated. If \code{TRUE}, all columns are preserved and the new columns are appended to the data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{verbose}{Toggle warnings.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{...}{Currently not used.} } \value{ A data frame with the newly created variable(s), or - when \code{append = TRUE} - \code{data} including new variables. } \description{ Separates a single variable into multiple new variables. } \examples{ # simple case d <- data.frame( x = c("1.a.6", "2.b.7", "3.c.8"), stringsAsFactors = FALSE ) d data_separate(d, new_columns = c("a", "b", "c")) # guess number of columns d <- data.frame( x = c("1.a.6", NA, "2.b.6.7", "3.c", "x.y.z"), stringsAsFactors = FALSE ) d data_separate(d, guess_columns = "mode") data_separate(d, guess_columns = "max") # drop left-most column data_separate(d, guess_columns = "mode", extra = "drop_left") # merge right-most column data_separate(d, guess_columns = "mode", extra = "merge_right") # fill columns with fewer values with left-most values data_separate(d, guess_columns = "mode", fill = "value_left") # fill and merge data_separate( d, guess_columns = "mode", fill = "value_left", extra = "merge_right" ) # multiple columns to split d <- data.frame( x = c("1.a.6", "2.b.7", "3.c.8"), y = c("x.y.z", "10.11.12", "m.n.o"), stringsAsFactors = FALSE ) d # split two columns, default column names data_separate(d, guess_columns = "mode") # split into new named columns, repeating column names data_separate(d, new_columns = c("a", "b", "c")) # split selected variable new columns data_separate(d, select = "y", new_columns = c("a", "b", "c")) # merge multiple split columns data_separate( d, new_columns = c("a", "b", "c"), merge_multiple = TRUE ) # merge multiple split columns data_separate( d, new_columns = c("a", "b", "c"), merge_multiple = TRUE, merge_separator = "-" ) # separate multiple columns, give proper column names d_sep <- data.frame( x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), y = c("m.n.99.22", "77.f.g.34", "44.9", NA), stringsAsFactors = FALSE ) data_separate( d_sep, select = c("x", "y"), new_columns = list( x = c("A", "B", "C"), # separate "x" into three columns y = c("EE", "FF", "GG", "HH") # separate "y" into four columns ), verbose = FALSE ) } \seealso{ \code{\link[=data_unite]{data_unite()}} } ================================================ FILE: man/data_summary.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_summary.R \name{data_summary} \alias{data_summary} \alias{data_summary.data.frame} \title{Summarize data} \usage{ data_summary(x, ...) \method{data_summary}{data.frame}(x, ..., by = NULL, remove_na = FALSE, suffix = NULL) } \arguments{ \item{x}{A (grouped) data frame.} \item{...}{One or more named expressions that define the new variable name and the function to compute the summary statistic. Example: \code{mean_sepal_width = mean(Sepal.Width)}. The expression can also be provided as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}. The summary function \code{n()} can be used to count the number of observations.} \item{by}{Optional character string, indicating the names of one or more variables in the data frame. If supplied, the data will be split by these variables and summary statistics will be computed for each group.} \item{remove_na}{Logical. If \code{TRUE}, missing values are omitted from the grouping variable. If \code{FALSE} (default), missing values are included as a level in the grouping variable.} \item{suffix}{Optional, suffixes to be added to the new variable names, especially useful when a function returns several values (e.g. \code{quantile()}). Can be: \itemize{ \item a character vector: all expressions in \code{...} must return the same number of values as elements in \code{suffix}. \item a list of named character vectors: the names of elements in \code{suffix} must match the names of the expressions. It is also allowed to specify suffixes for selected expressions only. } The new column names are a combination of the left-hand side (i.e., the name) of the expression and the related suffixes. If \code{suffix = NULL} (the default), and a summary expression returns multiple values, either the names of the returned values (if any) or automatically numbered suffixes such as \verb{_1}, \verb{_2}, etc. are used. See 'Examples'.} } \value{ A data frame with the requested summary statistics. } \description{ This function can be used to compute summary statistics for a data frame or a matrix. } \examples{ data(iris) data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) data_summary( iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = "Species" ) # same as d <- data_group(iris, "Species") data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) # multiple groups data(mtcars) data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear")) # expressions can also be supplied as character strings data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear")) # count observations within groups data_summary(mtcars, observations = n(), by = c("am", "gear")) # first and last observations of "mpg" within groups data_summary( mtcars, first = mpg[1], last = mpg[length(mpg)], by = c("am", "gear") ) # allow more than one-column-summaries for expressions d <- data.frame( x = rnorm(100, 1, 1), y = rnorm(100, 2, 2), groups = rep(1:4, each = 25) ) # since we have multiple columns for one expression, the names of the # returned summary results are used as suffix by default data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)) ) # if a summary function, like `fivenum()`, returns no named vector, suffixes # are automatically numbered data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), fivenum_y = fivenum(y) ) # specify column suffix for expressions, matching by names data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(quant_y = c("_Q1", "_Q2", "_Q3")) ) # name multiple expression suffixes, grouped by variable data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q3")), by = "groups" ) } ================================================ FILE: man/data_tabulate.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_tabulate.R \name{data_tabulate} \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} \alias{print.datawizard_table} \alias{display.datawizard_table} \title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) \method{data_tabulate}{default}( x, by = NULL, drop_levels = FALSE, weights = NULL, remove_na = FALSE, proportions = NULL, name = NULL, verbose = TRUE, ... ) \method{data_tabulate}{data.frame}( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, by = NULL, drop_levels = FALSE, weights = NULL, remove_na = FALSE, proportions = NULL, collapse = FALSE, verbose = TRUE, ... ) \method{print}{datawizard_table}(x, big_mark = NULL, ...) \method{display}{datawizard_table}(object, big_mark = NULL, format = "markdown", ...) } \arguments{ \item{x}{A (grouped) data frame, a vector or factor.} \item{...}{not used.} \item{by}{Optional vector or factor. If supplied, a crosstable is created. If \code{x} is a data frame, \code{by} can also be a character string indicating the name of a variable in \code{x}.} \item{drop_levels}{Logical, if \code{FALSE}, factor levels that do not occur in the data are included in the table (with frequency of zero), else unused factor levels are dropped from the frequency table.} \item{weights}{Optional numeric vector of weights. Must be of the same length as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} \item{remove_na}{Logical, if \code{FALSE}, missing values are included in the frequency or crosstable, else missing values are omitted. Note that the default for the \code{as.table()} method is \code{remove_na = TRUE}, so that missing values are not included in the returned table, which makes more sense for post-processing of the table, e.g. using \code{chisq.test()}.} \item{proportions}{Optional character string, indicating the type of percentages to be calculated. Only applies to crosstables, i.e. when \code{by} is not \code{NULL}. Can be \code{"row"} (row percentages), \code{"column"} (column percentages) or \code{"full"} (to calculate relative frequencies for the full table).} \item{name}{Optional character string, which includes the name that is used for printing.} \item{verbose}{Toggle warnings and messages.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger table for printing. This affects only printing, not the returned object.} \item{big_mark}{Optional character string, indicating the big mark that is used for large numbers. If \code{NULL} (default), a big mark is added automatically for large numbers (i.e. numbers with more than 5 digits). If you want to remove the big mark, set \code{big_mark = ""}.} \item{object}{An object returned by \code{data_tabulate()}.} \item{format}{String, indicating the output format. Can be \code{"markdown"} \code{"html"}, or \code{"tt"}. \code{format = "html"} create an HTML table using the \emph{gt} package. \code{format = "tt"} creates a \code{tinytable} object, which is either printed as markdown or HTML table, depending on the environment. See \code{\link[insight:export_table]{insight::export_table()}} for details.} } \value{ A data frame, or a list of data frames, with one frequency table as data frame per variable. } \description{ This function creates frequency or crosstables of variables, including the number of levels/values as well as the distribution of raw, valid and cumulative percentages. For crosstables, row, column and cell percentages can be calculated. } \details{ There is an \code{as.data.frame()} method, to return the frequency tables as a data frame. The structure of the returned object is a nested data frame, where the first column contains name of the variable for which frequencies were calculated, and the second column is a list column that contains the frequency tables as data frame. See \link{as.table.datawizard_table}. There is also an \code{as.table()} method, which returns a table object with the frequencies of the variable. This is useful for further statistical analysis, e.g. for using \code{chisq.test()} on the frequency table. See \link{as.table.datawizard_table}. } \note{ There are \code{print_html()} and \code{print_md()} methods available for printing frequency or crosstables in HTML and markdown format, e.g. \code{print_html(data_tabulate(x))}. The \code{print()} method for text outputs passes arguments in \code{...} to \code{\link[insight:export_table]{insight::export_table()}}. } \section{Crosstables}{ If \code{by} is supplied, a crosstable is created. The crosstable includes \verb{} (missing) values by default. The first column indicates values of \code{x}, the first row indicates values of \code{by} (including missing values). The last row and column contain the total frequencies for each row and column, respectively. Setting \code{remove_na = FALSE} will omit missing values from the crosstable. Setting \code{proportions} to \code{"row"} or \code{"column"} will add row or column percentages. Setting \code{proportions} to \code{"full"} will add relative frequencies for the full table. } \examples{ \dontshow{if (requireNamespace("poorman")) withAutoprint(\{ # examplesIf} # frequency tables ------- # ------------------------ data(efc) # vector/factor data_tabulate(efc$c172code) # drop missing values data_tabulate(efc$c172code, remove_na = TRUE) # data frame data_tabulate(efc, c("e42dep", "c172code")) # grouped data frame suppressPackageStartupMessages(library(poorman, quietly = TRUE)) efc \%>\% group_by(c172code) \%>\% data_tabulate("e16sex") # collapse tables efc \%>\% group_by(c172code) \%>\% data_tabulate("e16sex", collapse = TRUE) # for larger N's (> 100000), a big mark is automatically added set.seed(123) x <- sample(1:3, 1e6, TRUE) data_tabulate(x, name = "Large Number") # to remove the big mark, use "print(..., big_mark = "")" print(data_tabulate(x), big_mark = "") # weighted frequencies set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) data_tabulate(efc$e42dep, weights = efc$weights) # crosstables ------ # ------------------ # add some missing values set.seed(123) efc$e16sex[sample.int(nrow(efc), 5)] <- NA data_tabulate(efc, "c172code", by = "e16sex") # add row and column percentages data_tabulate(efc, "c172code", by = "e16sex", proportions = "row") data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") # omit missing values data_tabulate( efc$c172code, by = efc$e16sex, proportions = "column", remove_na = TRUE ) # round percentages out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") print(out, digits = 0) \dontshow{\}) # examplesIf} } \seealso{ \link{as.prop.table} } ================================================ FILE: man/data_to_long.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_to_long.R \name{data_to_long} \alias{data_to_long} \alias{reshape_longer} \title{Reshape (pivot) data from wide to long} \usage{ data_to_long( data, select = "all", names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, values_to = "value", values_drop_na = FALSE, rows_to = NULL, ignore_case = FALSE, regex = FALSE, ..., cols ) reshape_longer( data, select = "all", names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, values_to = "value", values_drop_na = FALSE, rows_to = NULL, ignore_case = FALSE, regex = FALSE, ..., cols ) } \arguments{ \item{data}{A data frame to convert to long format, so that it has more rows and fewer columns after the operation.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{names_to}{The name of the new column (variable) that will contain the \emph{names} from columns in \code{select} as values, to identify the source of the values. \code{names_to} can be a character vector with more than one column name, in which case \code{names_sep} or \code{names_pattern} must be provided in order to identify which parts of the column names go into newly created columns. See also 'Examples'.} \item{names_prefix}{A regular expression used to remove matching text from the start of each variable name.} \item{names_sep, names_pattern}{If \code{names_to} contains multiple values, this argument controls how the column name is broken up. \code{names_pattern} takes a regular expression containing matching groups, i.e. "()".} \item{values_to}{The name of the new column that will contain the \emph{values} of the columns in \code{select}.} \item{values_drop_na}{If \code{TRUE}, will drop rows that contain only \code{NA} in the \code{values_to} column. This effectively converts explicit missing values to implicit missing values, and should generally be used only when missing values in data were created by its structure.} \item{rows_to}{The name of the column that will contain the row names or row numbers from the original data. If \code{NULL}, will be removed.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{...}{Currently not used.} \item{cols}{Identical to \code{select}. This argument is here to ensure compatibility with \code{tidyr::pivot_longer()}. If both \code{select} and \code{cols} are provided, \code{cols} is used.} } \value{ If a tibble was provided as input, \code{reshape_longer()} also returns a tibble. Otherwise, it returns a data frame. } \description{ This function "lengthens" data, increasing the number of rows and decreasing the number of columns. This is a dependency-free base-R equivalent of \code{tidyr::pivot_longer()}. } \details{ Reshaping data into long format usually means that the input data frame is in \emph{wide} format, where multiple measurements taken on the same subject are stored in multiple columns (variables). The long format stores the same information in a single column, with each measurement per subject stored in a separate row. The values of all variables that are not in \code{select} will be repeated. The necessary information for \code{data_to_long()} is: \itemize{ \item The columns that contain the repeated measurements (\code{select}). \item The name of the newly created column that will contain the names of the columns in \code{select} (\code{names_to}), to identify the source of the values. \code{names_to} can also be a character vector with more than one column name, in which case \code{names_sep} or \code{names_pattern} must be provided to specify which parts of the column names go into the newly created columns. \item The name of the newly created column that contains the values of the columns in \code{select} (\code{values_to}). } In other words: repeated measurements that are spread across several columns will be gathered into a single column (\code{values_to}), with the original column names, that identify the source of the gathered values, stored in one or more new columns (\code{names_to}). } \examples{ \dontshow{if (all(insight::check_if_installed(c("psych", "tidyr"), quietly = TRUE))) withAutoprint(\{ # examplesIf} wide_data <- setNames( data.frame(replicate(2, rnorm(8))), c("Time1", "Time2") ) wide_data$ID <- 1:8 wide_data # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3)) # probably doesn't make much sense to mix "time" and "id" data_to_long(wide_data) # Customizing the names data_to_long( wide_data, select = c("Time1", "Time2"), names_to = "Timepoint", values_to = "Score" ) # Reshape multiple columns into long format. mydat <- data.frame( age = c(20, 30, 40), sex = c("Female", "Male", "Male"), score_t1 = c(30, 35, 32), score_t2 = c(33, 34, 37), score_t3 = c(36, 35, 38), speed_t1 = c(2, 3, 1), speed_t2 = c(3, 4, 5), speed_t3 = c(1, 8, 6) ) # The column names are split into two columns: "type" and "time". The # pattern for splitting column names is provided in `names_pattern`. Values # of all "score_*" and "speed_*" columns are gathered into a single column # named "count". data_to_long( mydat, select = 3:8, names_to = c("type", "time"), names_pattern = "(score|speed)_t(\\\\d+)", values_to = "count" ) # Full example # ------------------ data <- psych::bfi # Wide format with one row per participant's personality test # Pivot long format very_long_data <- data_to_long(data, select = regex("\\\\d"), # Select all columns that contain a digit names_to = "Item", values_to = "Score", rows_to = "Participant" ) head(very_long_data) even_longer_data <- data_to_long( tidyr::who, select = new_sp_m014:newrel_f65, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ) head(even_longer_data) \dontshow{\}) # examplesIf} } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_to_wide.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_to_wide.R \name{data_to_wide} \alias{data_to_wide} \alias{reshape_wider} \title{Reshape (pivot) data from long to wide} \usage{ data_to_wide( data, id_cols = NULL, values_from = "Value", names_from = "Name", names_sep = "_", names_prefix = "", names_glue = NULL, values_fill = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) reshape_wider( data, id_cols = NULL, values_from = "Value", names_from = "Name", names_sep = "_", names_prefix = "", names_glue = NULL, values_fill = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{data}{A data frame to convert to wide format, so that it has more columns and fewer rows post-widening than pre-widening.} \item{id_cols}{The name of the column that identifies the rows in the data by which observations are grouped and the gathered data is spread into new columns. Usually, this is a variable containing an ID for observations that have been repeatedly measured. If \code{NULL}, it will use all remaining columns that are not in \code{names_from} or \code{values_from} as ID columns. \code{id_cols} can also be a character vector with more than one name of identifier columns. See also 'Details' and 'Examples'.} \item{values_from}{The name of the columns in the original data that contains the values used to fill the new columns created in the widened data. Can also be one of the selection helpers (see argument \code{select} in \code{\link[=data_select]{data_select()}}).} \item{names_from}{The name of the column in the original data whose values will be used for naming the new columns created in the widened data. Each unique value in this column will become the name of one of these new columns. In case \code{names_prefix} is provided, column names will be concatenated with the string given in \code{names_prefix}. If \code{values_from} specifies more than one variable that should be widened, the new column names are a combination of the old column names in \code{values_from} and the \emph{values} from \code{names_from}, to avoid duplicate column names.} \item{names_sep}{If \code{names_from} or \code{values_from} contains multiple variables, this will be used to join their values together into a single string to use as a column name.} \item{names_prefix}{String added to the start of every variable name. This is particularly useful if \code{names_from} is a numeric vector and you want to create syntactic variable names.} \item{names_glue}{Instead of \code{names_sep} and \code{names_prefix}, you can supply a \href{https://glue.tidyverse.org/index.html}{glue specification} that uses the \code{names_from} columns to create custom column names. Note that the only delimiters supported by \code{names_glue} are curly brackets, \verb{\{} and \verb{\}}.} \item{values_fill}{Defunct argument, which has no function anymore. Will be removed in future versions.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} \item{...}{Not used for now.} } \value{ If a tibble was provided as input, \code{data_to_wide()} also returns a tibble. Otherwise, it returns a data frame. } \description{ This function "widens" data, increasing the number of columns and decreasing the number of rows. This is a dependency-free base-R equivalent of \code{tidyr::pivot_wider()}. } \details{ Reshaping data into wide format usually means that the input data frame is in \emph{long} format, where multiple measurements taken on the same subject are stored in multiple rows. The wide format stores the same information in a single row, with each measurement stored in a separate column. Thus, the necessary information for \code{data_to_wide()} is: \itemize{ \item The name of the column(s) that identify the groups or repeated measurements (\code{id_cols}). \item The name of the column whose \emph{values} will become the new column names (\code{names_from}). Since these values may not necessarily reflect appropriate column names, you can use \code{names_prefix} to add a prefix to each newly created column name. \item The name of the column(s) that contain the values (\code{values_from}) for the new columns that are created by \code{names_from}. } In other words: repeated measurements, as indicated by \code{id_cols}, that are saved into the column \code{values_from} will be spread into new columns, which will be named after the values in \code{names_from}. See also 'Examples'. } \examples{ \dontshow{if (requireNamespace("lme4", quietly = TRUE)) withAutoprint(\{ # examplesIf} data_long <- read.table(header = TRUE, text = " subject sex condition measurement 1 M control 7.9 1 M cond1 12.3 1 M cond2 10.7 2 F control 6.3 2 F cond1 10.6 2 F cond2 11.1 3 F control 9.5 3 F cond1 13.1 3 F cond2 13.8 4 M control 11.5 4 M cond1 13.4 4 M cond2 12.9") # converting long data into wide format data_to_wide( data_long, id_cols = "subject", names_from = "condition", values_from = "measurement" ) # converting long data into wide format with custom column names data_to_wide( data_long, id_cols = "subject", names_from = "condition", values_from = "measurement", names_prefix = "Var.", names_sep = "." ) # converting long data into wide format, combining multiple columns production <- expand.grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) production <- data_filter(production, (product == "A" & country == "AI") | product == "B") production$production <- rnorm(nrow(production)) data_to_wide( production, names_from = c("product", "country"), values_from = "production", names_glue = "prod_{product}_{country}" ) # reshaping multiple long columns into wide format. to avoid duplicate # column names, new names are a combination of the old column names in # `values_from` and the values from `names_from` data_long <- read.table(header = TRUE, text = " subject_id time score anxiety test 1 1 10 5 NA 1 2 NA 7 NA 2 1 15 6 NA 2 2 12 NA NA 3 1 18 8 NA 5 2 11 4 NA 4 1 NA 5 NA 4 2 14 NA NA") data_to_wide( data_long, id_cols = "subject_id", names_from = "time", values_from = c("score", "anxiety", "test") ) # using the "sleepstudy" dataset data(sleepstudy, package = "lme4") # the sleepstudy data contains repeated measurements of average reaction # times for each subjects over multiple days, in a sleep deprivation study. # It is in long-format, i.e. each row corresponds to a single measurement. # The variable "Days" contains the timepoint of the measurement, and # "Reaction" contains the measurement itself. Converting this data to wide # format will create a new column for each day, with the reaction time as the # value. head(sleepstudy) data_to_wide( sleepstudy, id_cols = "Subject", names_from = "Days", values_from = "Reaction" ) # clearer column names data_to_wide( sleepstudy, id_cols = "Subject", names_from = "Days", values_from = "Reaction", names_prefix = "Reaction_Day_" ) # For unequal group sizes, missing information is filled with NA d <- subset(sleepstudy, Days \%in\% c(0, 1, 2, 3, 4))[c(1:9, 11:13, 16:17, 21), ] # long format, different number of "Subjects" d data_to_wide( d, id_cols = "Subject", names_from = "Days", values_from = "Reaction", names_prefix = "Reaction_Day_" ) \dontshow{\}) # examplesIf} } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/data_unique.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_unique.R \name{data_unique} \alias{data_unique} \title{Keep only one row from all with duplicated IDs} \usage{ data_unique( data, select = NULL, keep = "best", exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) } \arguments{ \item{data}{A data frame.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{keep}{The method to be used for duplicate selection, either "best" (the default), "first", or "last".} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A data frame, containing only the chosen duplicates. } \description{ From all rows with at least one duplicated ID, keep only one. Methods for selecting the duplicated row are either the first duplicate, the last duplicate, or the "best" duplicate (default), based on the duplicate with the smallest number of \code{NA}. In case of ties, it picks the first duplicate, as it is the one most likely to be valid and authentic, given practice effects. Contrarily to \code{dplyr::distinct()}, \code{data_unique()} keeps all columns. } \examples{ df1 <- data.frame( id = c(1, 2, 3, 1, 3), item1 = c(NA, 1, 1, 2, 3), item2 = c(NA, 1, 1, 2, 3), item3 = c(NA, 1, 1, 2, 3) ) data_unique(df1, select = "id") } \seealso{ \code{\link[=data_duplicated]{data_duplicated()}} } ================================================ FILE: man/data_unite.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_unite.R \name{data_unite} \alias{data_unite} \title{Unite ("merge") multiple variables} \usage{ data_unite( data, new_column = NULL, select = NULL, exclude = NULL, separator = "_", append = FALSE, remove_na = FALSE, ignore_case = FALSE, verbose = TRUE, regex = FALSE, ... ) } \arguments{ \item{data}{A data frame.} \item{new_column}{The name of the new column, as a string.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{separator}{A character to use between values.} \item{append}{Logical, if \code{FALSE} (default), removes original columns that were united. If \code{TRUE}, all columns are preserved and the new column is appended to the data frame.} \item{remove_na}{Logical, if \code{TRUE}, missing values (\code{NA}) are not included in the united values. If \code{FALSE}, missing values are represented as \code{"NA"} in the united values.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{verbose}{Toggle warnings.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{...}{Currently not used.} } \value{ \code{data}, with a newly created variable. } \description{ Merge values of multiple variables per observation into one new variable. } \examples{ d <- data.frame( x = 1:3, y = letters[1:3], z = 6:8 ) d data_unite(d, new_column = "xyz") data_unite(d, new_column = "xyz", remove = FALSE) data_unite(d, new_column = "xyz", select = c("x", "z")) data_unite(d, new_column = "xyz", select = c("x", "z"), append = TRUE) } \seealso{ \code{\link[=data_separate]{data_separate()}} } ================================================ FILE: man/datawizard-package.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/datawizard-package.R \docType{package} \name{datawizard-package} \alias{datawizard-package} \alias{datawizard} \title{datawizard: Easy Data Wrangling and Statistical Transformations} \description{ A lightweight package to assist in key steps involved in any data analysis workflow: \itemize{ \item wrangling the raw data to get it in the needed form, \item applying preprocessing steps and statistical transformations, and \item compute statistical summaries of data properties and distributions. } It is also the data wrangling backend for packages in 'easystats' ecosystem. Reference: Patil et al. (2022) \doi{10.21105/joss.04684}. } \details{ \code{datawizard} } \seealso{ Useful links: \itemize{ \item \url{https://easystats.github.io/datawizard/} \item Report bugs at \url{https://github.com/easystats/datawizard/issues} } } \author{ \strong{Maintainer}: Etienne Bacher \email{etienne.bacher@protonmail.com} (\href{https://orcid.org/0000-0002-9271-5075}{ORCID}) Authors: \itemize{ \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) \item Mattan S. Ben-Shachar \email{matanshm@post.bgu.ac.il} (\href{https://orcid.org/0000-0002-4287-4801}{ORCID}) \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) } Other contributors: \itemize{ \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) [contributor] \item Thomas J. Faulkenberry \email{faulkenberry@tarleton.edu} [reviewer] \item Robert Garrett \email{rcg4@illinois.edu} [reviewer] } } \keyword{internal} ================================================ FILE: man/demean.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/demean.R \name{demean} \alias{demean} \alias{degroup} \alias{detrend} \title{Compute group-meaned and de-meaned variables} \usage{ demean( x, select, by, nested = FALSE, suffix_demean = "_within", suffix_groupmean = "_between", append = TRUE, add_attributes = TRUE, verbose = TRUE ) degroup( x, select, by, nested = FALSE, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", append = TRUE, add_attributes = TRUE, verbose = TRUE ) detrend( x, select, by, nested = FALSE, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", append = TRUE, add_attributes = TRUE, verbose = TRUE ) } \arguments{ \item{x}{A data frame.} \item{select}{Character vector (or formula) with names of variables to select that should be group- and de-meaned.} \item{by}{Character vector (or formula) with the name of the variable that indicates the group- or cluster-ID. For cross-classified or nested designs, \code{by} can also identify two or more variables as group- or cluster-IDs. If the data is nested and should be treated as such, set \code{nested = TRUE}. Else, if \code{by} defines two or more variables and \code{nested = FALSE}, a cross-classified design is assumed. Note that \code{demean()} and \code{degroup()} can't handle a mix of nested and cross-classified designs in one model. For nested designs, \code{by} can be: \itemize{ \item a character vector with the name of the variable that indicates the levels, ordered from \emph{highest} level to \emph{lowest} (e.g. \code{by = c("L4", "L3", "L2")}. \item a character vector with variable names in the format \code{by = "L4/L3/L2"}, where the levels are separated by \code{/}. } See also section \emph{De-meaning for cross-classified designs} and \emph{De-meaning for nested designs} below.} \item{nested}{Logical, if \code{TRUE}, the data is treated as nested. If \code{FALSE}, the data is treated as cross-classified. Only applies if \code{by} contains more than one variable.} \item{suffix_demean, suffix_groupmean}{String value, will be appended to the names of the group-meaned and de-meaned variables of \code{x}. By default, de-meaned variables will be suffixed with \code{"_within"} and grouped-meaned variables with \code{"_between"}.} \item{append}{Logical, if \code{TRUE} (default), the group- and de-meaned variables will be appended (column bind) to the original data \code{x}, thus returning both the original and the de-/group-meaned variables.} \item{add_attributes}{Logical, if \code{TRUE}, the returned variables gain attributes to indicate the within- and between-effects. This is only relevant when printing \code{model_parameters()} - in such cases, the within- and between-effects are printed in separated blocks.} \item{verbose}{Toggle warnings and messages.} \item{center}{Method for centering. \code{demean()} always performs mean-centering, while \code{degroup()} can use \code{center = "median"} or \code{center = "mode"} for median- or mode-centering, and also \code{"min"} or \code{"max"}.} } \value{ A data frame with the group-/de-meaned variables, which get the suffix \code{"_between"} (for the group-meaned variable) and \code{"_within"} (for the de-meaned variable) by default. For cross-classified or nested designs, the name pattern of the group-meaned variables is the name of the centered variable followed by the name of the variable that indicates the related grouping level, e.g. \code{predictor_L3_between} and \code{predictor_L2_between}. } \description{ \code{demean()} computes group- and de-meaned versions of a variable that can be used in regression analysis to model the between- and within-subject effect (person-mean centering or centering within clusters). \code{degroup()} is more generic in terms of the centering-operation. While \code{demean()} always uses mean-centering, \code{degroup()} can also use the mode or median for centering. } \section{Heterogeneity Bias}{ Mixed models include different levels of sources of variability, i.e. error terms at each level. When macro-indicators (or level-2 predictors, or higher-level units, or more general: \emph{group-level predictors that \strong{vary} within and across groups}) are included as fixed effects (i.e. treated as covariate at level-1), the variance that is left unaccounted for this covariate will be absorbed into the error terms of level-1 and level-2 (\emph{Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6.}): "Such covariates contain two parts: one that is specific to the higher-level entity that does not vary between occasions, and one that represents the difference between occasions, within higher-level entities" (\emph{Bell et al. 2015}). Hence, the error terms will be correlated with the covariate, which violates one of the assumptions of mixed models (iid, independent and identically distributed error terms). This bias is also called the \emph{heterogeneity bias} (\emph{Bell et al. 2015}). To resolve this problem, level-2 predictors used as (level-1) covariates should be separated into their "within" and "between" effects by "de-meaning" and "group-meaning": After demeaning time-varying predictors, "at the higher level, the mean term is no longer constrained by Level 1 effects, so it is free to account for all the higher-level variance associated with that variable" (\emph{Bell et al. 2015}). } \section{Panel data and correlating fixed and group effects}{ \code{demean()} is intended to create group- and de-meaned variables for panel regression models (fixed effects models), or for complex random-effect-within-between models (see \emph{Bell et al. 2015, 2018}), where group-effects (random effects) and fixed effects correlate (see \emph{Bafumi and Gelman 2006}). This can happen, for instance, when analyzing panel data, which can lead to \emph{Heterogeneity Bias}. To control for correlating predictors and group effects, it is recommended to include the group-meaned and de-meaned version of \emph{time-varying covariates} (and group-meaned version of \emph{time-invariant covariates} that are on a higher level, e.g. level-2 predictors) in the model. By this, one can fit complex multilevel models for panel data, including time-varying predictors, time-invariant predictors and random effects. } \section{Why mixed models are preferred over fixed effects models}{ A mixed models approach can model the causes of endogeneity explicitly by including the (separated) within- and between-effects of time-varying fixed effects and including time-constant fixed effects. Furthermore, mixed models also include random effects, thus a mixed models approach is superior to classic fixed-effects models, which lack information of variation in the group-effects or between-subject effects. Furthermore, fixed effects regression cannot include random slopes, which means that fixed effects regressions are neglecting "cross-cluster differences in the effects of lower-level controls (which) reduces the precision of estimated context effects, resulting in unnecessarily wide confidence intervals and low statistical power" (\emph{Heisig et al. 2017}). } \section{Terminology}{ The group-meaned variable is simply the mean of an independent variable within each group (or id-level or cluster) represented by \code{by}. It represents the cluster-mean of an independent variable. The regression coefficient of a group-meaned variable is the \emph{between-subject-effect}. The de-meaned variable is then the centered version of the group-meaned variable. De-meaning is sometimes also called person-mean centering or centering within clusters. The regression coefficient of a de-meaned variable represents the \emph{within-subject-effect}. } \section{De-meaning with continuous predictors}{ For continuous time-varying predictors, the recommendation is to include both their de-meaned and group-meaned versions as fixed effects, but not the raw (untransformed) time-varying predictors themselves. The de-meaned predictor should also be included as random effect (random slope). In regression models, the coefficient of the de-meaned predictors indicates the within-subject effect, while the coefficient of the group-meaned predictor indicates the between-subject effect. } \section{De-meaning with binary predictors}{ For binary time-varying predictors, there are two recommendations. First is to include the raw (untransformed) binary predictor as fixed effect only and the \emph{de-meaned} variable as random effect (random slope). The alternative would be to add the de-meaned version(s) of binary time-varying covariates as additional fixed effect as well (instead of adding it as random slope). Centering time-varying binary variables to obtain within-effects (level 1) isn't necessary. They have a sensible interpretation when left in the typical 0/1 format (\emph{Hoffmann 2015, chapter 8-2.I}). \code{demean()} will thus coerce categorical time-varying predictors to numeric to compute the de- and group-meaned versions for these variables, where the raw (untransformed) binary predictor and the de-meaned version should be added to the model. } \section{De-meaning of factors with more than 2 levels}{ Factors with more than two levels are demeaned in two ways: first, these are also converted to numeric and de-meaned; second, dummy variables are created (binary, with 0/1 coding for each level) and these binary dummy-variables are de-meaned in the same way (as described above). Packages like \strong{panelr} internally convert factors to dummies before demeaning, so this behaviour can be mimicked here. } \section{De-meaning interaction terms}{ There are multiple ways to deal with interaction terms of within- and between-effects. \itemize{ \item A classical approach is to simply use the product term of the de-meaned variables (i.e. introducing the de-meaned variables as interaction term in the model formula, e.g. \code{y ~ x_within * time_within}). This approach, however, might be subject to bias (see \emph{Giesselmann & Schmidt-Catran 2020}). \item Another option is to first calculate the product term and then apply the de-meaning to it. This approach produces an estimator "that reflects unit-level differences of interacted variables whose moderators vary within units", which is desirable if \emph{no} within interaction of two time-dependent variables is required. This is what \code{demean()} does internally when \code{select} contains interaction terms. \item A third option, when the interaction should result in a genuine within estimator, is to "double de-mean" the interaction terms (\emph{Giesselmann & Schmidt-Catran 2018}), however, this is currently not supported by \code{demean()}. If this is required, the \code{wmb()} function from the \strong{panelr} package should be used. } To de-mean interaction terms for within-between models, simply specify the term as interaction for the \code{select}-argument, e.g. \code{select = "a*b"} (see 'Examples'). } \section{De-meaning for cross-classified designs}{ \code{demean()} can handle cross-classified designs, where the data has two or more groups at the higher (i.e. second) level. In such cases, the \code{by}-argument can identify two or more variables that represent the cross-classified group- or cluster-IDs. The de-meaned variables for cross-classified designs are simply subtracting all group means from each individual value, i.e. \emph{fully cluster-mean-centering} (see \emph{Guo et al. 2024} for details). Note that de-meaning for cross-classified designs is \emph{not} equivalent to de-meaning of nested data structures from models with three or more levels. Set \code{nested = TRUE} to explicitly assume a nested design. For cross-classified designs, de-meaning is supposed to work for models like \code{y ~ x + (1|level3) + (1|level2)}, but \emph{not} for models like \code{y ~ x + (1|level3/level2)}. Note that \code{demean()} and \code{degroup()} can't handle a mix of nested and cross-classified designs in one model. } \section{De-meaning for nested designs}{ \emph{Brincks et al. (2017)} have suggested an algorithm to center variables for nested designs, which is implemented in \code{demean()}. For nested designs, set \code{nested = TRUE} \emph{and} specify the variables that indicate the different levels in descending order in the \code{by} argument. E.g., \verb{by = c("level4", "level3, "level2")} assumes a model like \code{y ~ x + (1|level4/level3/level2)}. An alternative notation for the \code{by}-argument would be \code{by = "level4/level3/level2"}, similar to the formula notation. } \section{Analysing panel data with mixed models using lme4}{ A description of how to translate the formulas described in \emph{Bell et al. 2018} into R using \code{lmer()} from \strong{lme4} can be found in \href{https://easystats.github.io/parameters/articles/demean.html}{this vignette}. } \examples{ data(iris) iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID iris$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID") head(x) x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID") head(x) # demean interaction term x*y dat <- data.frame( a = c(1, 2, 3, 4, 1, 2, 3, 4), x = c(4, 3, 3, 4, 1, 2, 1, 2), y = c(1, 2, 1, 2, 4, 3, 2, 1), ID = c(1, 2, 3, 1, 2, 3, 1, 2) ) demean(dat, select = c("a", "x*y"), by = "ID") # or in formula-notation demean(dat, select = ~ a + x * y, by = ~ID) } \references{ \itemize{ \item Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the American Political Science Association. \item Bell A, Fairbrother M, Jones K. 2019. Fixed and Random Effects Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074 \item Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects Modeling of Time-Series Cross-Sectional and Panel Data. Political Science Research and Methods, 3(1), 133–153. \item Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J., Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in Three-Level Contextual Models. Multivariate Behavioral Research, 52(2), 149–163. https://doi.org/10.1080/00273171.2016.1256753 \item Gelman A, Hill J. 2007. Data Analysis Using Regression and Multilevel/Hierarchical Models. Analytical Methods for Social Research. Cambridge, New York: Cambridge University Press \item Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed effects regression models. Sociological Methods & Research, 1–28. https://doi.org/10.1177/0049124120914934 \item Guo Y, Dhaliwal J, Rights JD. 2024. Disaggregating level-specific effects in cross-classified multilevel models. Behavior Research Methods, 56(4), 3023–3057. \item Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity: Why Multilevel Models May Benefit from Accounting for Cross-Cluster Differences in the Effects of Controls. American Sociological Review 82 (4): 796–827. \item Hoffman L. 2015. Longitudinal analysis: modeling within-person fluctuation and change. New York: Routledge } } \seealso{ If grand-mean centering (instead of centering within-clusters) is required, see \code{\link[=center]{center()}}. See \code{\link[performance:check_group_variation]{performance::check_group_variation()}} to check for heterogeneity bias. } ================================================ FILE: man/describe_distribution.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/describe_distribution.R \name{describe_distribution} \alias{describe_distribution} \alias{describe_distribution.numeric} \alias{describe_distribution.factor} \alias{describe_distribution.data.frame} \title{Describe a distribution} \usage{ describe_distribution(x, ...) \method{describe_distribution}{numeric}( x, centrality = "mean", dispersion = TRUE, iqr = TRUE, range = TRUE, quartiles = FALSE, ci = NULL, iterations = 100, threshold = 0.1, verbose = TRUE, ... ) \method{describe_distribution}{factor}(x, dispersion = TRUE, range = TRUE, verbose = TRUE, ...) \method{describe_distribution}{data.frame}( x, select = NULL, exclude = NULL, centrality = "mean", dispersion = TRUE, iqr = TRUE, range = TRUE, quartiles = FALSE, include_factors = FALSE, ci = NULL, iterations = 100, threshold = 0.1, ignore_case = FALSE, regex = FALSE, verbose = TRUE, by = NULL, ... ) } \arguments{ \item{x}{A numeric vector, a character vector, a data frame, or a list. See \code{Details}.} \item{...}{Additional arguments to be passed to or from methods.} \item{centrality}{The point-estimates (centrality indices) to compute. Character (vector) or list with one or more of these options: \code{"median"}, \code{"mean"}, \code{"MAP"} (see \code{\link[bayestestR:map_estimate]{map_estimate()}}), \code{"trimmed"} (which is just \code{mean(x, trim = threshold)}), \code{"mode"} or \code{"all"}.} \item{dispersion}{Logical, if \code{TRUE}, computes indices of dispersion related to the estimate(s) (\code{SD} and \code{MAD} for \code{mean} and \code{median}, respectively). Dispersion is not available for \code{"MAP"} or \code{"mode"} centrality indices.} \item{iqr}{Logical, if \code{TRUE}, the interquartile range is calculated (based on \code{\link[stats:IQR]{stats::IQR()}}, using \code{type = 6}).} \item{range}{Return the range (min and max).} \item{quartiles}{Return the first and third quartiles (25th and 75th percentiles).} \item{ci}{Confidence Interval (CI) level. Default is \code{NULL}, i.e. no confidence intervals are computed. If not \code{NULL}, confidence intervals are based on bootstrap replicates (see \code{iterations}).} \item{iterations}{The number of bootstrap replicates for computing confidence intervals. Only applies when \code{ci} is not \code{NULL}. Defaults to \code{100}. For more stable results, increase the number of \code{iterations}, but note that this can also increase the computation time significantly.} \item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates the fraction (0 to 0.5) of observations to be trimmed from each end of the vector before the mean is computed.} \item{verbose}{Show or silence warnings and messages.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{include_factors}{Logical, if \code{TRUE}, factors are included in the output, however, only columns for range (first and last factor levels) as well as n and missing will contain information.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{by}{Column names indicating how to split the data in various groups before describing the distribution. \code{by} groups will be added to potentially existing groups created by \code{data_group()}.} } \value{ A data frame with columns that describe the properties of the variables. } \description{ This function describes a distribution by a set of indices (e.g., measures of centrality, dispersion, range, skewness, (excess) kurtosis). } \details{ If \code{x} is a data frame, only numeric variables are kept and will be displayed in the summary by default. If \code{x} is a list, the behavior is different whether \code{x} is a stored list. If \code{x} is stored (for example, \code{describe_distribution(mylist)} where \code{mylist} was created before), artificial variable names are used in the summary (\code{Var_1}, \code{Var_2}, etc.). If \code{x} is an unstored list (for example, \code{describe_distribution(list(mtcars$mpg))}), then \code{"mtcars$mpg"} is used as variable name. } \note{ There is also a \href{https://easystats.github.io/see/articles/parameters.html}{\code{plot()}-method} implemented in the \href{https://easystats.github.io/see/}{\strong{see}-package}. } \examples{ \dontshow{if (require("bayestestR", quietly = TRUE)) withAutoprint(\{ # examplesIf} describe_distribution(rnorm(100)) data(iris) describe_distribution(iris) describe_distribution(iris, include_factors = TRUE, quartiles = TRUE) describe_distribution(list(mtcars$mpg, mtcars$cyl)) \dontshow{\}) # examplesIf} } \seealso{ \code{\link[=kurtosis]{kurtosis()}} to compute kurtosis (recognized as excess kurtosis). } ================================================ FILE: man/distribution_mode.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/descriptives.R \name{distribution_mode} \alias{distribution_mode} \title{Compute mode for a statistical distribution} \usage{ distribution_mode(x) } \arguments{ \item{x}{An atomic vector, a list, or a data frame.} } \value{ The value that appears most frequently in the provided data. The returned data structure will be the same as the entered one. } \description{ Compute mode for a statistical distribution } \examples{ distribution_mode(c(1, 2, 3, 3, 4, 5)) distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)) } \seealso{ For continuous variables, the \strong{Highest Maximum a Posteriori probability estimate (MAP)} may be a more useful way to estimate the most commonly-observed value than the mode. See \code{\link[bayestestR:map_estimate]{bayestestR::map_estimate()}}. } ================================================ FILE: man/efc.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{efc} \alias{efc} \title{Sample dataset from the EFC Survey} \description{ Selected variables from the EUROFAMCARE survey. Useful when testing on "real-life" data sets, including random missing values. This data set also has value and variable label attributes. } \keyword{data} ================================================ FILE: man/extract_column_names.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_select.R, R/extract_column_names.R \name{data_select} \alias{data_select} \alias{extract_column_names} \alias{find_columns} \title{Find or get columns in a data frame based on search patterns} \usage{ data_select( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) extract_column_names( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) find_columns( data, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{data}{A data frame.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} \item{...}{Arguments passed down to other functions. Mostly not used yet.} } \value{ \code{extract_column_names()} returns a character vector with column names that matched the pattern in \code{select} and \code{exclude}, or \code{NULL} if no matching column name was found. \code{data_select()} returns a data frame with matching columns. } \description{ \code{extract_column_names()} returns column names from a data set that match a certain search pattern, while \code{data_select()} returns the found data. } \details{ Specifically for \code{data_select()}, \code{select} can also be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Examples'. Note that it is possible to either pass an entire select helper or only the pattern inside a select helper as a function argument: \if{html}{\out{
}}\preformatted{foo <- function(data, pattern) \{ extract_column_names(data, select = starts_with(pattern)) \} foo(iris, pattern = "Sep") foo2 <- function(data, pattern) \{ extract_column_names(data, select = pattern) \} foo2(iris, pattern = starts_with("Sep")) }\if{html}{\out{
}} This means that it is also possible to use loop values as arguments or patterns: \if{html}{\out{
}}\preformatted{for (i in c("Sepal", "Sp")) \{ head(iris) |> extract_column_names(select = starts_with(i)) |> print() \} }\if{html}{\out{
}} However, this behavior is limited to a "single-level function". It will not work in nested functions, like below: \if{html}{\out{
}}\preformatted{inner <- function(data, arg) \{ extract_column_names(data, select = arg) \} outer <- function(data, arg) \{ inner(data, starts_with(arg)) \} outer(iris, "Sep") }\if{html}{\out{
}} In this case, it is better to pass the whole select helper as the argument of \code{outer()}: \if{html}{\out{
}}\preformatted{outer <- function(data, arg) \{ inner(data, arg) \} outer(iris, starts_with("Sep")) }\if{html}{\out{
}} } \examples{ # Find column names by pattern extract_column_names(iris, starts_with("Sepal")) extract_column_names(iris, ends_with("Width")) extract_column_names(iris, regex("\\\\.")) extract_column_names(iris, c("Petal.Width", "Sepal.Length")) # starts with "Sepal", but not allowed to end with "width" extract_column_names(iris, starts_with("Sepal"), exclude = contains("Width")) # find numeric with mean > 3.5 numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 extract_column_names(iris, numeric_mean_35) # find column names, using range extract_column_names(mtcars, c(cyl:hp, wt)) # find range of column names by range, using character vector extract_column_names(mtcars, c("cyl:hp", "wt")) # rename returned columns for "data_select()" head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/labels_to_levels.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/labels_to_levels.R \name{labels_to_levels} \alias{labels_to_levels} \alias{labels_to_levels.factor} \alias{labels_to_levels.data.frame} \title{Convert value labels into factor levels} \usage{ labels_to_levels(x, ...) \method{labels_to_levels}{factor}(x, verbose = TRUE, ...) \method{labels_to_levels}{data.frame}( x, select = NULL, exclude = NULL, ignore_case = FALSE, append = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A data frame or factor. Other variable types (e.g. numerics) are not allowed.} \item{...}{Currently not used.} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ \code{x}, where for all factors former levels are replaced by their value labels. } \description{ Convert value labels into factor levels } \details{ \code{labels_to_levels()} allows to use value labels of factors as their levels. } \examples{ data(efc) # create factor x <- as.factor(efc$c172code) # add value labels - these are not factor levels yet x <- assign_labels(x, values = c(`1` = "low", `2` = "mid", `3` = "high")) levels(x) data_tabulate(x) x <- labels_to_levels(x) levels(x) data_tabulate(x) } ================================================ FILE: man/makepredictcall.dw_transformer.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/makepredictcall.R \name{makepredictcall.dw_transformer} \alias{makepredictcall.dw_transformer} \title{Utility Function for Safe Prediction with \code{datawizard} transformers} \usage{ \method{makepredictcall}{dw_transformer}(var, call) } \arguments{ \item{var}{A variable.} \item{call}{The term in the formula, as a call.} } \value{ A replacement for \code{call} for the \code{predvars} attribute of the terms. } \description{ This function allows for the use of (some of) \code{datawizard}'s transformers inside a model formula. See examples below. \cr\cr Currently, \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, & \code{\link[=rescale]{rescale()}} are supported. } \examples{ data("mtcars") train <- mtcars[1:30, ] test <- mtcars[31:32, ] m1 <- lm(mpg ~ center(hp), data = train) predict(m1, newdata = test) # Data is "centered" before the prediction is made, # according to the center of the old data m2 <- lm(mpg ~ standardize(hp), data = train) m3 <- lm(mpg ~ scale(hp), data = train) # same as above predict(m2, newdata = test) # Data is "standardized" before the prediction is made. predict(m3, newdata = test) # Data is "standardized" before the prediction is made. m4 <- lm(mpg ~ normalize(hp), data = mtcars) m5 <- lm(mpg ~ rescale(hp, to = c(-3, 3)), data = mtcars) (newdata <- data.frame(hp = c(range(mtcars$hp), 400))) # 400 is outside original range! model.frame(delete.response(terms(m4)), data = newdata) model.frame(delete.response(terms(m5)), data = newdata) } \seealso{ \code{\link[stats:makepredictcall]{stats::makepredictcall()}} } \concept{datawizard-transformers} ================================================ FILE: man/mean_sd.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mean_sd.R \name{mean_sd} \alias{mean_sd} \alias{median_mad} \title{Summary Helpers} \usage{ mean_sd(x, times = 1L, remove_na = TRUE, named = TRUE, ...) median_mad( x, times = 1L, remove_na = TRUE, constant = 1.4826, named = TRUE, ... ) } \arguments{ \item{x}{A numeric vector (or one that can be coerced to one via \code{as.numeric()}) to be summarized.} \item{times}{How many SDs above and below the Mean (or MADs around the Median)} \item{remove_na}{Logical. Should \code{NA} values be removed before computing (\code{TRUE}) or not (\code{FALSE}, default)?} \item{named}{Should the vector be named? (E.g., \code{c("-SD" = -1, Mean = 1, "+SD" = 2)}.)} \item{...}{Not used.} \item{constant}{scale factor.} } \value{ A (possibly named) numeric vector of length \code{2*times + 1} of SDs below the mean, the mean, and SDs above the mean (or median and MAD). } \description{ Summary Helpers } \examples{ mean_sd(mtcars$mpg) mean_sd(mtcars$mpg, times = 2L) median_mad(mtcars$mpg) } ================================================ FILE: man/means_by_group.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/means_by_group.R \name{means_by_group} \alias{means_by_group} \alias{means_by_group.numeric} \alias{means_by_group.data.frame} \title{Summary of mean values by group} \usage{ means_by_group(x, ...) \method{means_by_group}{numeric}(x, by = NULL, ci = 0.95, weights = NULL, digits = NULL, ...) \method{means_by_group}{data.frame}( x, select = NULL, by = NULL, ci = 0.95, weights = NULL, digits = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A vector or a data frame.} \item{...}{Currently not used} \item{by}{If \code{x} is a numeric vector, \code{by} should be a factor that indicates the group-classifying categories. If \code{x} is a data frame, \code{by} should be a character string, naming the variable in \code{x} that is used for grouping. Numeric vectors are coerced to factors. Not that \code{by} should only refer to a single variable.} \item{ci}{Level of confidence interval for mean estimates. Default is \code{0.95}. Use \code{ci = NA} to suppress confidence intervals.} \item{weights}{If \code{x} is a numeric vector, \code{weights} should be a vector of weights that will be applied to weight all observations. If \code{x} is a data frame, \code{weights} can also be a character string indicating the name of the variable in \code{x} that should be used for weighting. Default is \code{NULL}, so no weights are used.} \item{digits}{Optional scalar, indicating the amount of digits after decimal point when rounding estimates and values.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A data frame with information on mean and further summary statistics for each sub-group. } \description{ Computes summary table of means by groups. } \details{ This function is comparable to \code{aggregate(x, by, mean)}, but provides some further information, including summary statistics from a One-Way-ANOVA using \code{x} as dependent and \code{by} as independent variable. \code{\link[emmeans:contrast]{emmeans::contrast()}} is used to get p-values for each sub-group. P-values indicate whether each group-mean is significantly different from the total mean. } \examples{ data(efc) means_by_group(efc, "c12hour", "e42dep") data(iris) means_by_group(iris, "Sepal.Width", "Species") # weighting efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) means_by_group(efc, "c12hour", "e42dep", weights = "weight") } ================================================ FILE: man/nhanes_sample.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{nhanes_sample} \alias{nhanes_sample} \title{Sample dataset from the National Health and Nutrition Examination Survey} \description{ Selected variables from the National Health and Nutrition Examination Survey that are used in the example from Lumley (2010), Appendix E. } \references{ Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley } \keyword{data} ================================================ FILE: man/normalize.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/normalize.R, R/unnormalize.R \name{normalize} \alias{normalize} \alias{normalize.numeric} \alias{normalize.data.frame} \alias{unnormalize} \alias{unnormalize.numeric} \alias{unnormalize.data.frame} \alias{unnormalize.grouped_df} \title{Normalize numeric variable to 0-1 range} \usage{ normalize(x, ...) \method{normalize}{numeric}(x, include_bounds = TRUE, verbose = TRUE, ...) \method{normalize}{data.frame}( x, select = NULL, exclude = NULL, include_bounds = TRUE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) unnormalize(x, ...) \method{unnormalize}{numeric}(x, verbose = TRUE, ...) \method{unnormalize}{data.frame}( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) \method{unnormalize}{grouped_df}( x, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A numeric vector, (grouped) data frame, or matrix. See 'Details'.} \item{...}{Arguments passed to or from other methods.} \item{include_bounds}{Numeric or logical. Using this can be useful in case of beta-regression, where the response variable is not allowed to include zeros and ones. If \code{TRUE}, the input is normalized to a range that includes zero and one. If \code{FALSE}, the return value is compressed, using Smithson and Verkuilen's (2006) formula \code{(x * (n - 1) + 0.5) / n}, to avoid zeros and ones in the normalized variables. Else, if numeric (e.g., \code{0.001}), \code{include_bounds} defines the "distance" to the lower and upper bound, i.e. the normalized vectors are rescaled to a range from \code{0 + include_bounds} to \code{1 - include_bounds}.} \item{verbose}{Toggle warnings and messages on or off.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{append}{Logical or string. If \code{TRUE}, standardized variables get new column names (with the suffix \code{"_z"}) and are appended (column bind) to \code{x}, thus returning both the original and the standardized variables. If \code{FALSE}, original variables in \code{x} will be overwritten by their standardized versions. If a character value, standardized variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ A normalized object. } \description{ Performs a normalization of data, i.e., it scales variables in the range 0 - 1. This is a special case of \code{\link[=rescale]{rescale()}}. \code{unnormalize()} is the counterpart, but only works for variables that have been normalized with \code{normalize()}. } \details{ \itemize{ \item If \code{x} is a matrix, normalization is performed across all values (not column- or row-wise). For column-wise normalization, convert the matrix to a data.frame. \item If \code{x} is a grouped data frame (\code{grouped_df}), normalization is performed separately for each group. } } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ normalize(c(0, 1, 5, -5, -2)) normalize(c(0, 1, 5, -5, -2), include_bounds = FALSE) # use a value defining the bounds normalize(c(0, 1, 5, -5, -2), include_bounds = 0.001) head(normalize(trees)) } \references{ Smithson M, Verkuilen J (2006). A Better Lemon Squeezer? Maximum-Likelihood Regression with Beta-Distributed Dependent Variables. Psychological Methods, 11(1), 54–71. } \seealso{ See \code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas. Other transform utilities: \code{\link{ranktransform}()}, \code{\link{rescale}()}, \code{\link{reverse}()}, \code{\link{standardize}()} } \concept{transform utilities} ================================================ FILE: man/ranktransform.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ranktransform.R \name{ranktransform} \alias{ranktransform} \alias{ranktransform.numeric} \alias{ranktransform.data.frame} \title{(Signed) rank transformation} \usage{ ranktransform(x, ...) \method{ranktransform}{numeric}( x, sign = FALSE, method = "average", zeros = "na", verbose = TRUE, ... ) \method{ranktransform}{data.frame}( x, select = NULL, exclude = NULL, sign = FALSE, method = "average", ignore_case = FALSE, regex = FALSE, zeros = "na", verbose = TRUE, ... ) } \arguments{ \item{x}{Object.} \item{...}{Arguments passed to or from other methods.} \item{sign}{Logical, if \code{TRUE}, return signed ranks.} \item{method}{Treatment of ties. Can be one of \code{"average"} (default), \code{"first"}, \code{"last"}, \code{"random"}, \code{"max"} or \code{"min"}. See \code{\link[=rank]{rank()}} for details.} \item{zeros}{How to handle zeros. If \code{"na"} (default), they are marked as \code{NA}. If \code{"signrank"}, they are kept during the ranking and marked as zeros. This is only used when \code{sign = TRUE}.} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ A rank-transformed object. } \description{ Transform numeric values with the integers of their rank (i.e., 1st smallest, 2nd smallest, 3rd smallest, etc.). Setting the \code{sign} argument to \code{TRUE} will give you signed ranks, where the ranking is done according to absolute size but where the sign is preserved (i.e., 2, 1, -3, 4). } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ ranktransform(c(0, 1, 5, -5, -2)) # By default, zeros are converted to NA suppressWarnings( ranktransform(c(0, 1, 5, -5, -2), sign = TRUE) ) ranktransform(c(0, 1, 5, -5, -2), sign = TRUE, zeros = "signrank") head(ranktransform(trees)) } \seealso{ Other transform utilities: \code{\link{normalize}()}, \code{\link{rescale}()}, \code{\link{reverse}()}, \code{\link{standardize}()} } \concept{transform utilities} ================================================ FILE: man/recode_into.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/recode_into.R \name{recode_into} \alias{recode_into} \title{Recode values from one or more variables into a new variable} \usage{ recode_into( ..., data = NULL, default = NA, overwrite = TRUE, preserve_na = FALSE, verbose = TRUE ) } \arguments{ \item{...}{A sequence of two-sided formulas, where the left hand side (LHS) is a logical matching condition that determines which values match this case. The LHS of this formula is also called "recode pattern" (e.g., in messages). The right hand side (RHS) indicates the replacement value.} \item{data}{Optional, name of a data frame. This can be used to avoid writing the data name multiple times in \code{...}. See 'Examples'.} \item{default}{Indicates the default value that is chosen when no match in the formulas in \code{...} is found. If not provided, \code{NA} is used as default value.} \item{overwrite}{Logical, if \code{TRUE} (default) and more than one recode pattern apply to the same case, already recoded values will be overwritten by subsequent recode patterns. If \code{FALSE}, former recoded cases will not be altered by later recode patterns that would apply to those cases again. A warning message is printed to alert such situations and to avoid unintentional recodings.} \item{preserve_na}{Logical, if \code{TRUE} and \code{default} is not \code{NA}, missing values in the original variable will be set back to \code{NA} in the recoded variable (unless overwritten by other recode patterns). If \code{FALSE}, missing values in the original variable will be recoded to \code{default}. Setting \code{preserve_na = TRUE} prevents unintentional overwriting of missing values with \code{default}, which means that you won't find valid values where the original data only had missing values. See 'Examples'.} \item{verbose}{Toggle warnings.} } \value{ A vector with recoded values. } \description{ This functions recodes values from one or more variables into a new variable. It is a convenient function to avoid nested \code{\link[=ifelse]{ifelse()}} statements, which is similar to \code{dplyr::case_when()}. } \examples{ x <- 1:30 recode_into( x > 15 ~ "a", x > 10 & x <= 15 ~ "b", default = "c" ) x <- 1:10 # default behaviour: second recode pattern "x > 5" overwrites # some of the formerly recoded cases from pattern "x >= 3 & x <= 7" recode_into( x >= 3 & x <= 7 ~ 1, x > 5 ~ 2, default = 0, verbose = FALSE ) # setting "overwrite = FALSE" will not alter formerly recoded cases recode_into( x >= 3 & x <= 7 ~ 1, x > 5 ~ 2, default = 0, overwrite = FALSE, verbose = FALSE ) set.seed(123) d <- data.frame( x = sample(1:5, 30, TRUE), y = sample(letters[1:5], 30, TRUE), stringsAsFactors = FALSE ) # from different variables into new vector recode_into( d$x \%in\% 1:3 & d$y \%in\% c("a", "b") ~ 1, d$x > 3 ~ 2, default = 0 ) # no need to write name of data frame each time recode_into( x \%in\% 1:3 & y \%in\% c("a", "b") ~ 1, x > 3 ~ 2, data = d, default = 0 ) # handling of missing values d <- data.frame( x = c(1, NA, 2, NA, 3, 4), y = c(1, 11, 3, NA, 5, 6) ) # first NA in x is overwritten by valid value from y # we have no known value for second NA in x and y, # thus we get one NA in the result recode_into( x <= 3 ~ 1, y > 5 ~ 2, data = d, default = 0, preserve_na = TRUE ) # first NA in x is overwritten by valid value from y # default value is used for second NA recode_into( x <= 3 ~ 1, y > 5 ~ 2, data = d, default = 0, preserve_na = FALSE ) } ================================================ FILE: man/recode_values.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/recode_values.R \name{recode_values} \alias{recode_values} \alias{recode_values.numeric} \alias{recode_values.data.frame} \title{Recode old values of variables into new values} \usage{ recode_values(x, ...) \method{recode_values}{numeric}( x, recode = NULL, default = NULL, preserve_na = TRUE, verbose = TRUE, ... ) \method{recode_values}{data.frame}( x, select = NULL, exclude = NULL, recode = NULL, default = NULL, preserve_na = TRUE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A data frame, numeric or character vector, or factor.} \item{...}{not used.} \item{recode}{A list of named vectors, which indicate the recode pairs. The \emph{names} of the list-elements (i.e. the left-hand side) represent the \emph{new} values, while the values of the list-elements indicate the original (old) values that should be replaced. When recoding numeric vectors, element names have to be surrounded in backticks. For example, \code{recode=list(`0`=1)} would recode all \code{1} into \code{0} in a numeric vector. See also 'Examples' and 'Details'.} \item{default}{Defines the default value for all values that have no match in the recode-pairs. If \code{NULL}, original values will be preserved when there is no match. Note that, if \code{preserve_na=FALSE}, missing values (\code{NA}) are also captured by the \code{default} argument, and thus will also be recoded into the specified value. See 'Examples' and 'Details'.} \item{preserve_na}{Logical, if \code{TRUE}, \code{NA} (missing values) are preserved. This overrides any other arguments, including \code{default}. Hence, if \code{preserve_na=TRUE}, \code{default} will no longer convert \code{NA} into the specified default value.} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ \code{x}, where old values are replaced by new values. } \description{ This functions recodes old values into new values and can be used to to recode numeric or character vectors, or factors. } \details{ This section describes the pattern of the \code{recode} arguments, which also provides some shortcuts, in particular when recoding numeric values. \itemize{ \item Single values Single values either need to be wrapped in backticks (in case of numeric values) or "as is" (for character or factor levels). Example: \code{recode=list(`0`=1,`1`=2)} would recode 1 into 0, and 2 into 1. For factors or character vectors, an example is: \code{recode=list(x="a",y="b")} (recode "a" into "x" and "b" into "y"). \item Multiple values Multiple values that should be recoded into a new value can be separated with comma. Example: \code{recode=list(`1`=c(1,4),`2`=c(2,3))} would recode the values 1 and 4 into 1, and 2 and 3 into 2. It is also possible to define the old values as a character string, like: \code{recode=list(`1`="1,4",`2`="2,3")} For factors or character vectors, an example is: \code{recode=list(x=c("a","b"),y=c("c","d"))}. \item Value range Numeric value ranges can be defined using the \code{:}. Example: \code{recode=list(`1`=1:3,`2`=4:6)} would recode all values from 1 to 3 into 1, and 4 to 6 into 2. \item \code{min} and \code{max} placeholder to use the minimum or maximum value of the (numeric) variable. Useful, e.g., when recoding ranges of values. Example: \code{recode=list(`1`="min:10",`2`="11:max")}. \item \code{default} values The \code{default} argument defines the default value for all values that have no match in the recode-pairs. For example, \verb{recode=list(`1`=c(1,2),`2`=c(3,4)), default=9} would recode values 1 and 2 into 1, 3 and 4 into 2, and all other values into 9. If \code{preserve_na} is set to \code{FALSE}, \code{NA} (missing values) will also be recoded into the specified default value. \item Reversing and rescaling See \code{\link[=reverse]{reverse()}} and \code{\link[=rescale]{rescale()}}. } } \note{ You can use \code{options(data_recode_pattern = "old=new")} to switch the behaviour of the \code{recode}-argument, i.e. recode-pairs are now following the pattern \verb{old values = new values}, e.g. if \code{getOption("data_recode_pattern")} is set to \code{"old=new"}, then \code{recode(`1`=0)} would recode all 1 into 0. The default for \code{recode(`1`=0)} is to recode all 0 into 1. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ # numeric ---------- set.seed(123) x <- sample(c(1:4, NA), 15, TRUE) table(x, useNA = "always") out <- recode_values(x, list(`0` = 1, `1` = 2:3, `2` = 4)) out table(out, useNA = "always") # to recode NA values, set preserve_na to FALSE out <- recode_values( x, list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA), preserve_na = FALSE ) out table(out, useNA = "always") # preserve na ---------- out <- recode_values(x, list(`0` = 1, `1` = 2:3), default = 77) out table(out, useNA = "always") # recode na into default ---------- out <- recode_values( x, list(`0` = 1, `1` = 2:3), default = 77, preserve_na = FALSE ) out table(out, useNA = "always") # factors (character vectors are similar) ---------- set.seed(123) x <- as.factor(sample(c("a", "b", "c"), 15, TRUE)) table(x) out <- recode_values(x, list(x = "a", y = c("b", "c"))) out table(out) out <- recode_values(x, list(x = "a", y = "b", z = "c")) out table(out) out <- recode_values(x, list(y = "b,c"), default = 77) # same as # recode_values(x, list(y = c("b", "c")), default = 77) out table(out) # data frames ---------- set.seed(123) d <- data.frame( x = sample(c(1:4, NA), 12, TRUE), y = as.factor(sample(c("a", "b", "c"), 12, TRUE)), stringsAsFactors = FALSE ) recode_values( d, recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c")), append = TRUE ) # switch recode pattern to "old=new" ---------- options(data_recode_pattern = "old=new") # numeric set.seed(123) x <- sample(c(1:4, NA), 15, TRUE) table(x, useNA = "always") out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2)) table(out, useNA = "always") # factors (character vectors are similar) set.seed(123) x <- as.factor(sample(c("a", "b", "c"), 15, TRUE)) table(x) out <- recode_values(x, list(a = "x", `b, c` = "y")) table(out) # reset options options(data_recode_pattern = NULL) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/reexports.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_tabulate.R \docType{import} \name{reexports} \alias{reexports} \alias{print_html} \alias{print_md} \alias{display} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{insight}{\code{\link[insight]{display}}, \code{\link[insight:display]{print_html}}, \code{\link[insight:display]{print_md}}} }} ================================================ FILE: man/remove_empty.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/remove_empty.R \name{remove_empty} \alias{remove_empty} \alias{empty_columns} \alias{empty_rows} \alias{remove_empty_columns} \alias{remove_empty_rows} \title{Return or remove variables or observations that are completely missing} \usage{ empty_columns(x) empty_rows(x) remove_empty_columns(x) remove_empty_rows(x) remove_empty(x) } \arguments{ \item{x}{A data frame.} } \value{ \itemize{ \item For \code{empty_columns()} and \code{empty_rows()}, a numeric (named) vector with row or column indices of those variables that completely have missing values. \item For \code{remove_empty_columns()} and \code{remove_empty_rows()}, a data frame with "empty" columns or rows removed, respectively. \item For \code{remove_empty()}, \strong{both} empty rows and columns will be removed. } } \description{ These functions check which rows or columns of a data frame completely contain missing values, i.e. which observations or variables completely have missing values, and either (1) returns their indices; or (2) removes them from the data frame. } \details{ For character vectors, empty string values (i.e. \code{""}) are also considered as missing value. Thus, if a character vector only contains \code{NA} and \code{""}, it is considered as empty variable and will be removed. Same applies to observations (rows) that only contain \code{NA} or \code{""}. } \examples{ tmp <- data.frame( a = c(1, 2, 3, NA, 5), b = c(1, NA, 3, NA, 5), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5) ) tmp # indices of empty columns or rows empty_columns(tmp) empty_rows(tmp) # remove empty columns or rows remove_empty_columns(tmp) remove_empty_rows(tmp) # remove empty columns and rows remove_empty(tmp) # also remove "empty" character vectors tmp <- data.frame( a = c(1, 2, 3, NA, 5), b = c(1, NA, 3, NA, 5), c = c("", "", "", "", ""), stringsAsFactors = FALSE ) empty_columns(tmp) } ================================================ FILE: man/replace_nan_inf.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/replace_nan_inf.R \name{replace_nan_inf} \alias{replace_nan_inf} \title{Convert infinite or \code{NaN} values into \code{NA}} \usage{ replace_nan_inf(x, ...) } \arguments{ \item{x}{A vector or a dataframe} \item{...}{Currently not used.} } \value{ Data with \code{Inf}, \code{-Inf}, and \code{NaN} converted to \code{NA}. } \description{ Replaces all infinite (\code{Inf} and \code{-Inf}) or \code{NaN} values with \code{NA}. } \examples{ # a vector x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7) replace_nan_inf(x) # a data frame df <- data.frame( x = c(1, NA, 5, Inf, 2, NA), y = c(3, NaN, 4, -Inf, 6, 7), stringsAsFactors = FALSE ) replace_nan_inf(df) } ================================================ FILE: man/rescale.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_rescale.R \name{rescale} \alias{rescale} \alias{change_scale} \alias{rescale.numeric} \alias{rescale.data.frame} \title{Rescale Variables to a New Range} \usage{ rescale(x, ...) change_scale(x, ...) \method{rescale}{numeric}( x, to = c(0, 100), multiply = NULL, add = NULL, range = NULL, verbose = TRUE, ... ) \method{rescale}{data.frame}( x, select = NULL, exclude = NULL, to = c(0, 100), multiply = NULL, add = NULL, range = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) } \arguments{ \item{x}{A (grouped) data frame, numeric vector or factor.} \item{...}{Arguments passed to or from other methods.} \item{to}{Numeric vector of length 2 giving the new range that the variable will have after rescaling. To reverse-score a variable, the range should be given with the maximum value first. See examples.} \item{multiply}{If not \code{NULL}, \code{to} is ignored and \code{multiply} will be used, giving the factor by which the actual range of \code{x} should be expanded. For example, if a vector ranges from 5 to 15 and \code{multiply = 1.1}, the current range of 10 will be expanded by the factor of 1.1, giving a new range of 11. Thus, the rescaled vector would range from 4.5 to 15.5.} \item{add}{A vector of length 1 or 2. If not \code{NULL}, \code{to} is ignored and \code{add} will be used, giving the amount by which the minimum and maximum of the actual range of \code{x} should be expanded. For example, if a vector ranges from 5 to 15 and \code{add = 1}, the range will be expanded from 4 to 16. If \code{add} is of length 2, then the first value is used for the lower bound and the second value for the upper bound.} \item{range}{Initial (old) range of values. If \code{NULL}, will take the range of the input vector (\code{range(x)}).} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ A rescaled object. } \description{ Rescale variables to a new range. Can also be used to reverse-score variables (change the keying/scoring direction), or to expand a range. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ rescale(c(0, 1, 5, -5, -2)) rescale(c(0, 1, 5, -5, -2), to = c(-5, 5)) rescale(c(1, 2, 3, 4, 5), to = c(-2, 2)) # Specify the "theoretical" range of the input vector rescale(c(1, 3, 4), to = c(0, 40), range = c(0, 4)) # Reverse-score a variable rescale(c(1, 2, 3, 4, 5), to = c(5, 1)) rescale(c(1, 2, 3, 4, 5), to = c(2, -2)) # Data frames head(rescale(iris, to = c(0, 1))) head(rescale(iris, to = c(0, 1), select = "Sepal.Length")) # One can specify a list of ranges head(rescale(iris, to = list( "Sepal.Length" = c(0, 1), "Petal.Length" = c(-1, 0) ))) # "expand" ranges by a factor or a given value x <- 5:15 x # both will expand the range by 10\% rescale(x, multiply = 1.1) rescale(x, add = 0.5) # expand range by different values rescale(x, add = c(1, 3)) # Specify list of multipliers d <- data.frame(x = 5:15, y = 5:15) rescale(d, multiply = list(x = 1.1, y = 0.5)) } \seealso{ See \code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas. Other transform utilities: \code{\link{normalize}()}, \code{\link{ranktransform}()}, \code{\link{reverse}()}, \code{\link{standardize}()} } \concept{transform utilities} ================================================ FILE: man/rescale_weights.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rescale_weights.R \name{rescale_weights} \alias{rescale_weights} \title{Rescale design weights for multilevel analysis} \usage{ rescale_weights( data, probability_weights = NULL, by = NULL, nest = FALSE, method = "carle" ) } \arguments{ \item{data}{A data frame.} \item{probability_weights}{Variable indicating the probability (design or sampling) weights of the survey data (level-1-weight), provided as character string or formula.} \item{by}{Variable names (as character vector, or as formula), indicating the grouping structure (strata) of the survey data (level-2-cluster variable). It is also possible to create weights for multiple group variables; in such cases, each created weighting variable will be suffixed by the name of the group variable. This argument is required for \code{method = "carle"}, but optional for \code{method = "kish"}.} \item{nest}{Logical, if \code{TRUE} and \code{by} indicates at least two group variables, then groups are "nested", i.e. groups are now a combination from each group level of the variables in \code{by}. This argument is not used when \code{method = "kish"}.} \item{method}{String, indicating which rescale-method is used for rescaling weights. Can be either \code{"carle"} (default) or \code{"kish"}. See 'Details'. If \code{method = "carle"}, the \code{by} argument is required.} } \value{ \code{data}, including the new weighting variable(s). For \code{method = "carle"}, new columns \code{rescaled_weights_a} and \code{rescaled_weights_b} are returned, and for \code{method = "kish"}, the returned data contains a column \code{rescaled_weights}. These represent the rescaled design weights to use in multilevel models (use these variables for the \code{weights} argument). } \description{ Most functions to fit multilevel and mixed effects models only allow the user to specify frequency weights, but not design (i.e., sampling or probability) weights, which should be used when analyzing complex samples (e.g., probability samples). \code{rescale_weights()} implements two algorithms, one proposed by \cite{Asparouhov (2006)} and \cite{Carle (2009)}, to rescale design weights in survey data to account for the grouping structure of multilevel models, and one based on the design effect proposed by \cite{Kish (1965)}, to rescale weights by the design effect to account for additional sampling error introduced by weighting. } \details{ \itemize{ \item \code{method = "carle"} Rescaling is based on two methods: For \code{rescaled_weights_a}, the sample weights \code{probability_weights} are adjusted by a factor that represents the proportion of group size divided by the sum of sampling weights within each group. The adjustment factor for \code{rescaled_weights_b} is the sum of sample weights within each group divided by the sum of squared sample weights within each group (see Carle (2009), Appendix B). In other words, \code{rescaled_weights_a} "scales the weights so that the new weights sum to the cluster sample size" while \code{rescaled_weights_b} "scales the weights so that the new weights sum to the effective cluster size". Regarding the choice between scaling methods A and B, Carle suggests that "analysts who wish to discuss point estimates should report results based on weighting method A. For analysts more interested in residual between-group variance, method B may generally provide the least biased estimates". In general, it is recommended to fit a non-weighted model and weighted models with both scaling methods and when comparing the models, see whether the "inferential decisions converge", to gain confidence in the results. Though the bias of scaled weights decreases with increasing group size, method A is preferred when insufficient or low group size is a concern. The group ID and probably PSU may be used as random effects (e.g. nested design, or group and PSU as varying intercepts), depending on the survey design that should be mimicked. \item \code{method = "kish"} Rescaling is based on scaling the sample weights so the mean value is 1, which means the sum of all weights equals the sample size. Next, the design effect (\emph{Kish 1965}) is calculated, which is the mean of the squared weights divided by the squared mean of the weights. The scaled sample weights are then divided by the design effect. This method is most appropriate when weights are based on additional variables beyond the grouping variables in the model (e.g., other demographic characteristics), but may also be useful in other contexts. Some tests on real-world survey-data suggest that, in comparison to the Carle-method, the Kish-method comes closer to estimates from a regular survey-design using the \strong{survey} package. Note that these tests are not representative and it is recommended to check your results against a standard survey-design. } } \examples{ \dontshow{if (all(insight::check_if_installed(c("lme4", "parameters"), quietly = TRUE))) withAutoprint(\{ # examplesIf} data(nhanes_sample) head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) # also works with multiple group-variables head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU"))) # or nested structures. x <- rescale_weights( data = nhanes_sample, probability_weights = "WTINT2YR", by = c("SDMVSTRA", "SDMVPSU"), nest = TRUE ) head(x) \donttest{ # compare different methods, using multilevel-Poisson regression d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") result1 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), data = d, weights = rescaled_weights_a ) result2 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), data = d, weights = rescaled_weights_b ) d <- rescale_weights( nhanes_sample, "WTINT2YR", method = "kish" ) result3 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), data = d, weights = rescaled_weights ) d <- rescale_weights( nhanes_sample, "WTINT2YR", "SDMVSTRA", method = "kish" ) result4 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), data = d, weights = rescaled_weights ) parameters::compare_parameters( list(result1, result2, result3, result4), exponentiate = TRUE, column_names = c("Carle (A)", "Carle (B)", "Kish", "Kish (grouped)") ) } \dontshow{\}) # examplesIf} } \references{ \itemize{ \item Asparouhov T. (2006). General Multi-Level Modeling with Sampling Weights. Communications in Statistics - Theory and Methods 35: 439-460 \item Carle A.C. (2009). Fitting multilevel models in complex survey data with design weights: Recommendations. BMC Medical Research Methodology 9(49): 1-13 \item Kish, L. (1965) Survey Sampling. London: Wiley. } } ================================================ FILE: man/reshape_ci.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/reshape_ci.R \name{reshape_ci} \alias{reshape_ci} \title{Reshape CI between wide/long formats} \usage{ reshape_ci(x, ci_type = "CI") } \arguments{ \item{x}{A data frame containing columns named \code{CI_low} and \code{CI_high} (or similar, see \code{ci_type}).} \item{ci_type}{String indicating the "type" (i.e. prefix) of the interval columns. Per \emph{easystats} convention, confidence or credible intervals are named \code{CI_low} and \code{CI_high}, and the related \code{ci_type} would be \code{"CI"}. If column names for other intervals differ, \code{ci_type} can be used to indicate the name, e.g. \code{ci_type = "SI"} can be used for support intervals, where the column names in the data frame would be \code{SI_low} and \code{SI_high}.} } \value{ A data frame with columns corresponding to confidence intervals reshaped either to wide or long format. } \description{ Reshape CI between wide/long formats. } \examples{ x <- data.frame( Parameter = c("Term 1", "Term 2", "Term 1", "Term 2"), CI = c(0.8, 0.8, 0.9, 0.9), CI_low = c(0.2, 0.3, 0.1, 0.15), CI_high = c(0.5, 0.6, 0.8, 0.85), stringsAsFactors = FALSE ) reshape_ci(x) reshape_ci(reshape_ci(x)) } ================================================ FILE: man/reverse.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_reverse.R \name{reverse} \alias{reverse} \alias{reverse_scale} \alias{reverse.numeric} \alias{reverse.data.frame} \title{Reverse-Score Variables} \usage{ reverse(x, ...) reverse_scale(x, ...) \method{reverse}{numeric}(x, range = NULL, verbose = TRUE, ...) \method{reverse}{data.frame}( x, select = NULL, exclude = NULL, range = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = FALSE, ... ) } \arguments{ \item{x}{A (grouped) data frame, numeric vector or factor.} \item{...}{Arguments passed to or from other methods.} \item{range}{Range of values that is used as reference for reversing the scale. For numeric variables, can be \code{NULL} or a numeric vector of length two, indicating the lowest and highest value of the reference range. If \code{NULL}, will take the range of the input vector (\code{range(x)}). For factors, \code{range} can be \code{NULL}, a numeric vector of length two, or a (numeric) vector of at least the same length as factor levels (i.e. must be equal to or larger than \code{nlevels(x)}). Note that providing a \code{range} for factors usually only makes sense when factor levels are numeric, not characters.} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ A reverse-scored object. } \description{ Reverse-score variables (change the keying/scoring direction). } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ reverse(c(1, 2, 3, 4, 5)) reverse(c(-2, -1, 0, 2, 1)) # Specify the "theoretical" range of the input vector reverse(c(1, 3, 4), range = c(0, 4)) # Factor variables reverse(factor(c(1, 2, 3, 4, 5))) reverse(factor(c(1, 2, 3, 4, 5)), range = 0:10) # Data frames head(reverse(iris)) head(reverse(iris, select = "Sepal.Length")) } \seealso{ Other transform utilities: \code{\link{normalize}()}, \code{\link{ranktransform}()}, \code{\link{rescale}()}, \code{\link{standardize}()} } \concept{transform utilities} ================================================ FILE: man/row_count.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/row_count.R \name{row_count} \alias{row_count} \title{Count specific values row-wise} \usage{ row_count( data, select = NULL, exclude = NULL, count = NULL, allow_coercion = TRUE, ignore_case = FALSE, regex = FALSE, verbose = TRUE ) } \arguments{ \item{data}{A data frame with at least two columns, where number of specific values are counted row-wise.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{count}{The value for which the row sum should be computed. May be a numeric value, a character string (for factors or character vectors), \code{NA} or \code{Inf}.} \item{allow_coercion}{Logical. If \code{FALSE}, \code{count} matches only values of same class (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. In order to count factor levels in the data, use \code{count = factor("level")}. See 'Examples'.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A vector with row-wise counts of values specified in \code{count}. } \description{ \code{row_count()} mimics base R's \code{rowSums()}, with sums for a specific value indicated by \code{count}. Hence, it is similar to \code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including strict comparisons. Comparisons using \code{==} coerce values to atomic vectors, thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also possible to make "type safe" comparisons using the \code{allow_coercion} argument, where \code{"2" == 2} is not true. } \examples{ dat <- data.frame( c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, 8) ) # count all 4s per row row_count(dat, count = 4) # count all missing values per row row_count(dat, count = NA) dat <- data.frame( c1 = c("1", "2", NA, "3"), c2 = c(NA, "2", NA, "3"), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, Inf) ) # count all 2s and "2"s per row row_count(dat, count = 2) # only count 2s, but not "2"s row_count(dat, count = 2, allow_coercion = FALSE) dat <- data.frame( c1 = factor(c("1", "2", NA, "3")), c2 = c("2", "1", NA, "3"), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, Inf) ) # find only character "2"s row_count(dat, count = "2", allow_coercion = FALSE) # find only factor level "2"s row_count(dat, count = factor("2"), allow_coercion = FALSE) } ================================================ FILE: man/row_means.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/row_means.R \name{row_means} \alias{row_means} \alias{row_sums} \title{Row means or sums (optionally with minimum amount of valid values)} \usage{ row_means( data, select = NULL, exclude = NULL, min_valid = NULL, digits = NULL, ignore_case = FALSE, regex = FALSE, remove_na = FALSE, verbose = TRUE ) row_sums( data, select = NULL, exclude = NULL, min_valid = NULL, digits = NULL, ignore_case = FALSE, regex = FALSE, remove_na = FALSE, verbose = TRUE ) } \arguments{ \item{data}{A data frame with at least two columns, where row means or row sums are applied.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{min_valid}{Optional, a numeric value of length 1. May either be \itemize{ \item a numeric value that indicates the amount of valid values per row to calculate the row mean or row sum; \item or a value between \code{0} and \code{1}, indicating a proportion of valid values per row to calculate the row mean or row sum (see 'Details'). \item \code{NULL} (default), in which all cases are considered. } If a row's sum of valid values is less than \code{min_valid}, \code{NA} will be returned.} \item{digits}{Numeric value indicating the number of decimal places to be used for rounding mean values. Negative values are allowed (see 'Details'). By default, \code{digits = NULL} and no rounding is used.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) values before calculating row means or row sums. Only applies if \code{min_valid} is not specified.} \item{verbose}{Toggle warnings.} } \value{ A vector with row means (for \code{row_means()}) or row sums (for \code{row_sums()}) for those rows with at least \code{n} valid values. } \description{ This function is similar to the SPSS \code{MEAN.n} or \code{SUM.n} function and computes row means or row sums from a data frame or matrix if at least \code{min_valid} values of a row are valid (and not \code{NA}). } \details{ Rounding to a negative number of \code{digits} means rounding to a power of ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest hundred. For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value from \code{0} to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} non-missing values, the row mean or row sum is returned. If \code{min_valid} is a non-integer value from 0 to 1, \code{min_valid} is considered to indicate the proportion of required non-missing values per row. E.g., if \code{min_valid = 0.75}, a row must have at least \code{ncol(data) * min_valid} non-missing values for the row mean or row sum to be calculated. See 'Examples'. } \examples{ dat <- data.frame( c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, 8) ) # default, all means are shown, if no NA values are present row_means(dat) # remove all NA before computing row means row_means(dat, remove_na = TRUE) # needs at least 4 non-missing values per row row_means(dat, min_valid = 4) # 1 valid return value row_sums(dat, min_valid = 4) # 1 valid return value # needs at least 3 non-missing values per row row_means(dat, min_valid = 3) # 2 valid return values # needs at least 2 non-missing values per row row_means(dat, min_valid = 2) # needs at least 1 non-missing value per row, for two selected variables row_means(dat, select = c("c1", "c3"), min_valid = 1) # needs at least 50\% of non-missing values per row row_means(dat, min_valid = 0.5) # 3 valid return values row_sums(dat, min_valid = 0.5) # needs at least 75\% of non-missing values per row row_means(dat, min_valid = 0.75) # 2 valid return values } ================================================ FILE: man/rownames.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-rows.R \name{rownames_as_column} \alias{rownames_as_column} \alias{column_as_rownames} \alias{rowid_as_column} \title{Tools for working with row names or row ids} \usage{ rownames_as_column(x, var = "rowname") column_as_rownames(x, var = "rowname") rowid_as_column(x, var = "rowid") } \arguments{ \item{x}{A data frame.} \item{var}{Name of column to use for row names/ids. For \code{column_as_rownames()}, this argument can be the variable name or the column number. For \code{rownames_as_column()} and \code{rowid_as_column()}, the column name must not already exist in the data.} } \value{ A data frame. } \description{ Tools for working with row names or row ids } \details{ These are similar to \code{tibble}'s functions \code{column_to_rownames()}, \code{rownames_to_column()} and \code{rowid_to_column()}. Note that the behavior of \code{rowid_as_column()} is different for grouped dataframe: instead of making the rowid unique across the full dataframe, it creates rowid per group. Therefore, there can be several rows with the same rowid if they belong to different groups. If you are familiar with \code{dplyr}, this is similar to doing the following: \if{html}{\out{
}}\preformatted{data |> group_by(grp) |> mutate(id = row_number()) |> ungroup() }\if{html}{\out{
}} } \examples{ # Convert between row names and column -------------------------------- test <- rownames_as_column(mtcars, var = "car") test head(column_as_rownames(test, var = "car")) test_data <- head(iris) rowid_as_column(test_data) rowid_as_column(test_data, var = "my_id") } ================================================ FILE: man/skewness.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/skewness_kurtosis.R \name{skewness} \alias{skewness} \alias{skewness.numeric} \alias{kurtosis} \alias{kurtosis.numeric} \alias{print.parameters_kurtosis} \alias{print.parameters_skewness} \alias{summary.parameters_skewness} \alias{summary.parameters_kurtosis} \title{Compute Skewness and (Excess) Kurtosis} \usage{ skewness(x, ...) \method{skewness}{numeric}( x, remove_na = TRUE, type = "2", iterations = NULL, verbose = TRUE, ... ) kurtosis(x, ...) \method{kurtosis}{numeric}( x, remove_na = TRUE, type = "2", iterations = NULL, verbose = TRUE, ... ) \method{print}{parameters_kurtosis}(x, digits = 3, test = FALSE, ...) \method{print}{parameters_skewness}(x, digits = 3, test = FALSE, ...) \method{summary}{parameters_skewness}(object, test = FALSE, ...) \method{summary}{parameters_kurtosis}(object, test = FALSE, ...) } \arguments{ \item{x}{A numeric vector or data.frame.} \item{...}{Arguments passed to or from other methods.} \item{remove_na}{Logical. Should \code{NA} values be removed before computing (\code{TRUE}) or not (\code{FALSE}, default)?} \item{type}{Type of algorithm for computing skewness. May be one of \code{1} (or \code{"1"}, \code{"I"} or \code{"classic"}), \code{2} (or \code{"2"}, \code{"II"} or \code{"SPSS"} or \code{"SAS"}) or \code{3} (or \code{"3"}, \code{"III"} or \code{"Minitab"}). See 'Details'.} \item{iterations}{The number of bootstrap replicates for computing standard errors. If \code{NULL} (default), parametric standard errors are computed.} \item{verbose}{Toggle warnings and messages.} \item{digits}{Number of decimal places.} \item{test}{Logical, if \code{TRUE}, tests if skewness or kurtosis is significantly different from zero.} \item{object}{An object returned by \code{skewness()} or \code{kurtosis()}.} } \value{ Values of skewness or kurtosis. } \description{ Compute Skewness and (Excess) Kurtosis } \details{ \subsection{Skewness}{ Symmetric distributions have a \code{skewness} around zero, while a negative skewness values indicates a "left-skewed" distribution, and a positive skewness values indicates a "right-skewed" distribution. Examples for the relationship of skewness and distributions are: \itemize{ \item Normal distribution (and other symmetric distribution) has a skewness of 0 \item Half-normal distribution has a skewness just below 1 \item Exponential distribution has a skewness of 2 \item Lognormal distribution can have a skewness of any positive value, depending on its parameters } (\cite{https://en.wikipedia.org/wiki/Skewness}) } \subsection{Types of Skewness}{ \code{skewness()} supports three different methods for estimating skewness, as discussed in \cite{Joanes and Gill (1988)}: \itemize{ \item Type "1" is the "classical" method, which is \code{g1 = (sum((x - mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5} \item Type "2" first calculates the type-1 skewness, then adjusts the result: \code{G1 = g1 * sqrt(n * (n - 1)) / (n - 2)}. This is what SAS and SPSS usually return. \item Type "3" first calculates the type-1 skewness, then adjusts the result: \code{b1 = g1 * ((1 - 1 / n))^1.5}. This is what Minitab usually returns. } } \subsection{Kurtosis}{ The \code{kurtosis} is a measure of "tailedness" of a distribution. A distribution with a kurtosis values of about zero is called "mesokurtic". A kurtosis value larger than zero indicates a "leptokurtic" distribution with \emph{fatter} tails. A kurtosis value below zero indicates a "platykurtic" distribution with \emph{thinner} tails (\cite{https://en.wikipedia.org/wiki/Kurtosis}). } \subsection{Types of Kurtosis}{ \code{kurtosis()} supports three different methods for estimating kurtosis, as discussed in \cite{Joanes and Gill (1988)}: \itemize{ \item Type "1" is the "classical" method, which is \code{g2 = n * sum((x - mean(x))^4) / (sum((x - mean(x))^2)^2) - 3}. \item Type "2" first calculates the type-1 kurtosis, then adjusts the result: \code{G2 = ((n + 1) * g2 + 6) * (n - 1)/((n - 2) * (n - 3))}. This is what SAS and SPSS usually return \item Type "3" first calculates the type-1 kurtosis, then adjusts the result: \code{b2 = (g2 + 3) * (1 - 1 / n)^2 - 3}. This is what Minitab usually returns. } } \subsection{Standard Errors}{ It is recommended to compute empirical (bootstrapped) standard errors (via the \code{iterations} argument) than relying on analytic standard errors (\cite{Wright & Herrington, 2011}). } } \examples{ skewness(rnorm(1000)) kurtosis(rnorm(1000)) } \references{ \itemize{ \item D. N. Joanes and C. A. Gill (1998). Comparing measures of sample skewness and kurtosis. The Statistician, 47, 183–189. \item Wright, D. B., & Herrington, J. A. (2011). Problematic standard errors and confidence intervals for skewness and kurtosis. Behavior research methods, 43(1), 8-17. } } ================================================ FILE: man/slide.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/slide.R \name{slide} \alias{slide} \alias{slide.numeric} \alias{slide.data.frame} \title{Shift numeric value range} \usage{ slide(x, ...) \method{slide}{numeric}(x, lowest = 0, ...) \method{slide}{data.frame}( x, select = NULL, exclude = NULL, lowest = 0, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A data frame or numeric vector.} \item{...}{not used.} \item{lowest}{Numeric, indicating the lowest (minimum) value when converting factors or character vectors to numeric values.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ \code{x}, where the range of numeric variables starts at a new value. } \description{ This functions shifts the value range of a numeric variable, so that the new range starts at a given value. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ # numeric head(mtcars$gear) head(slide(mtcars$gear)) head(slide(mtcars$gear, lowest = 10)) # data frame sapply(slide(mtcars, lowest = 1), min) sapply(mtcars, min) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: man/smoothness.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/smoothness.R \name{smoothness} \alias{smoothness} \title{Quantify the smoothness of a vector} \usage{ smoothness(x, method = "cor", lag = 1, iterations = NULL, ...) } \arguments{ \item{x}{Numeric vector (similar to a time series).} \item{method}{Can be \code{"diff"} (the standard deviation of the standardized differences) or \code{"cor"} (default, lag-one autocorrelation).} \item{lag}{An integer indicating which lag to use. If less than \code{1}, will be interpreted as expressed in percentage of the length of the vector.} \item{iterations}{The number of bootstrap replicates for computing standard errors. If \code{NULL} (default), parametric standard errors are computed.} \item{...}{Arguments passed to or from other methods.} } \value{ Value of smoothness. } \description{ Quantify the smoothness of a vector } \examples{ x <- (-10:10)^3 + rnorm(21, 0, 100) plot(x) smoothness(x, method = "cor") smoothness(x, method = "diff") } \references{ https://stats.stackexchange.com/questions/24607/how-to-measure-smoothness-of-a-time-series-in-r } ================================================ FILE: man/standardize.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/standardize.R, R/unstandardize.R \name{standardize} \alias{standardize} \alias{standardise} \alias{standardize.numeric} \alias{standardize.factor} \alias{standardize.data.frame} \alias{unstandardize} \alias{unstandardise} \alias{unstandardize.numeric} \alias{unstandardize.data.frame} \title{Standardization (Z-scoring)} \usage{ standardize(x, ...) standardise(x, ...) \method{standardize}{numeric}( x, robust = FALSE, two_sd = FALSE, weights = NULL, reference = NULL, center = NULL, scale = NULL, verbose = TRUE, ... ) \method{standardize}{factor}( x, robust = FALSE, two_sd = FALSE, weights = NULL, force = FALSE, verbose = TRUE, ... ) \method{standardize}{data.frame}( x, select = NULL, exclude = NULL, robust = FALSE, two_sd = FALSE, weights = NULL, reference = NULL, center = NULL, scale = NULL, remove_na = c("none", "selected", "all"), force = FALSE, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) unstandardize(x, ...) unstandardise(x, ...) \method{unstandardize}{numeric}( x, center = NULL, scale = NULL, reference = NULL, robust = FALSE, two_sd = FALSE, ... ) \method{unstandardize}{data.frame}( x, center = NULL, scale = NULL, reference = NULL, robust = FALSE, two_sd = FALSE, select = NULL, exclude = NULL, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A (grouped) data frame, a vector or a statistical model (for \code{unstandardize()} cannot be a model).} \item{...}{Arguments passed to or from other methods.} \item{robust}{Logical, if \code{TRUE}, centering is done by subtracting the median from the variables and dividing it by the median absolute deviation (MAD). If \code{FALSE}, variables are standardized by subtracting the mean and dividing it by the standard deviation (SD).} \item{two_sd}{If \code{TRUE}, the variables are scaled by two times the deviation (SD or MAD depending on \code{robust}). This method can be useful to obtain model coefficients of continuous parameters comparable to coefficients related to binary predictors, when applied to \strong{the predictors} (not the outcome) (Gelman, 2008).} \item{weights}{Can be \code{NULL} (for no weighting), or: \itemize{ \item For model: if \code{TRUE} (default), a weighted-standardization is carried out. \item For \code{data.frame}s: a numeric vector of weights, or a character of the name of a column in the \code{data.frame} that contains the weights. \item For numeric vectors: a numeric vector of weights. }} \item{reference}{A data frame or variable from which the centrality and deviation will be computed instead of from the input variable. Useful for standardizing a subset or new data according to another data frame.} \item{center, scale}{\itemize{ \item For \code{standardize()}: \cr Numeric values, which can be used as alternative to \code{reference} to define a reference centrality and deviation. If \code{scale} and \code{center} are of length 1, they will be recycled to match the length of selected variables for standardization. Else, \code{center} and \code{scale} must be of same length as the number of selected variables. Values in \code{center} and \code{scale} will be matched to selected variables in the provided order, unless a named vector is given. In this case, names are matched against the names of the selected variables. \item For \code{unstandardize()}: \cr \code{center} and \code{scale} correspond to the center (the mean / median) and the scale (SD / MAD) of the original non-standardized data (for data frames, should be named, or have column order correspond to the numeric column). However, one can also directly provide the original data through \code{reference}, from which the center and the scale will be computed (according to \code{robust} and \code{two_sd}). Alternatively, if the input contains the attributes \code{center} and \code{scale} (as does the output of \code{standardize()}), it will take it from there if the rest of the arguments are absent. }} \item{verbose}{Toggle warnings and messages on or off.} \item{force}{Logical, if \code{TRUE}, forces recoding of factors and character vectors as well.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{remove_na}{How should missing values (\code{NA}) be treated: if \code{"none"} (default): each column's standardization is done separately, ignoring \code{NA}s. Else, rows with \code{NA} in the columns selected with \code{select} / \code{exclude} (\code{"selected"}) or in all columns (\code{"all"}) are dropped before standardization, and the resulting data frame does not include these cases.} \item{append}{Logical or string. If \code{TRUE}, standardized variables get new column names (with the suffix \code{"_z"}) and are appended (column bind) to \code{x}, thus returning both the original and the standardized variables. If \code{FALSE}, original variables in \code{x} will be overwritten by their standardized versions. If a character value, standardized variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ The standardized object (either a standardize data frame or a statistical model fitted on standardized data). } \description{ Performs a standardization of data (z-scoring), i.e., centering and scaling, so that the data is expressed in terms of standard deviation (i.e., mean = 0, SD = 1) or Median Absolute Deviance (median = 0, MAD = 1). When applied to a statistical model, this function extracts the dataset, standardizes it, and refits the model with this standardized version of the dataset. The \code{\link[=normalize]{normalize()}} function can also be used to scale all numeric variables within the 0 - 1 range. \cr\cr For model standardization, see \code{\link[=standardize.default]{standardize.default()}}. } \note{ When \code{x} is a vector or a data frame with \verb{remove_na = "none")}, missing values are preserved, so the return value has the same length / number of rows as the original input. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ d <- iris[1:4, ] # vectors standardise(d$Petal.Length) # Data frames # overwrite standardise(d, select = c("Sepal.Length", "Sepal.Width")) # append standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE) # append, suffix standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = "_std") # standardizing with reference center and scale d <- data.frame( a = c(-2, -1, 0, 1, 2), b = c(3, 4, 5, 6, 7) ) # default standardization, based on mean and sd of each variable standardize(d) # means are 0 and 5, sd ~ 1.581139 # standardization, based on mean and sd set to the same values standardize(d, center = c(0, 5), scale = c(1.581, 1.581)) # standardization, mean and sd for each variable newly defined standardize(d, center = c(3, 4), scale = c(2, 4)) # standardization, taking same mean and sd for each variable standardize(d, center = 1, scale = 3) } \seealso{ See \code{\link[=center]{center()}} for grand-mean centering of variables, and \code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas. Other transform utilities: \code{\link{normalize}()}, \code{\link{ranktransform}()}, \code{\link{rescale}()}, \code{\link{reverse}()} Other standardize: \code{\link{standardize.default}()} } \concept{standardize} \concept{transform utilities} ================================================ FILE: man/standardize.default.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/standardize.models.R \name{standardize.default} \alias{standardize.default} \alias{standardize_models} \title{Re-fit a model with standardized data} \usage{ \method{standardize}{default}( x, robust = FALSE, two_sd = FALSE, weights = TRUE, verbose = TRUE, include_response = TRUE, ... ) } \arguments{ \item{x}{A statistical model.} \item{robust}{Logical, if \code{TRUE}, centering is done by subtracting the median from the variables and dividing it by the median absolute deviation (MAD). If \code{FALSE}, variables are standardized by subtracting the mean and dividing it by the standard deviation (SD).} \item{two_sd}{If \code{TRUE}, the variables are scaled by two times the deviation (SD or MAD depending on \code{robust}). This method can be useful to obtain model coefficients of continuous parameters comparable to coefficients related to binary predictors, when applied to \strong{the predictors} (not the outcome) (Gelman, 2008).} \item{weights}{If \code{TRUE} (default), a weighted-standardization is carried out.} \item{verbose}{Toggle warnings and messages on or off.} \item{include_response}{If \code{TRUE} (default), the response value will also be standardized. If \code{FALSE}, only the predictors will be standardized. \itemize{ \item Note that for GLMs and models with non-linear link functions, the response value will not be standardized, to make re-fitting the model work. \item If the model contains an \code{\link[stats:offset]{stats::offset()}}, the offset variable(s) will be standardized only if the response is standardized. If \code{two_sd = TRUE}, offsets are standardized by one-sd (similar to the response). \item (For \code{mediate} models, the \code{include_response} refers to the outcome in the y model; m model's response will always be standardized when possible). }} \item{...}{Arguments passed to or from other methods.} } \value{ A statistical model fitted on standardized data } \description{ Performs a standardization of data (z-scoring) using \code{\link[=standardize]{standardize()}} and then re-fits the model to the standardized data. \cr\cr Standardization is done by completely refitting the model on the standardized data. Hence, this approach is equal to standardizing the variables \emph{before} fitting the model and will return a new model object. This method is particularly recommended for complex models that include interactions or transformations (e.g., polynomial or spline terms). The \code{robust} (default to \code{FALSE}) argument enables a robust standardization of data, based on the \code{median} and the \code{MAD} instead of the \code{mean} and the \code{SD}. } \section{Generalized Linear Models}{ Standardization for generalized linear models (GLM, GLMM, etc) is done only with respect to the predictors (while the outcome remains as-is, unstandardized) - maintaining the interpretability of the coefficients (e.g., in a binomial model: the exponent of the standardized parameter is the OR of a change of 1 SD in the predictor, etc.) } \section{Dealing with Factors}{ \code{standardize(model)} or \code{standardize_parameters(model, method = "refit")} do \emph{not} standardize categorical predictors (i.e. factors) / their dummy-variables, which may be a different behaviour compared to other R packages (such as \strong{lm.beta}) or other software packages (like SPSS). To mimic such behaviours, either use \code{standardize_parameters(model, method = "basic")} to obtain post-hoc standardized parameters, or standardize the data with \code{standardize(data, force = TRUE)} \emph{before} fitting the model. } \section{Transformed Variables}{ When the model's formula contains transformations (e.g. \code{y ~ exp(X)}) the transformation effectively takes place after standardization (e.g., \code{exp(scale(X))}). Since some transformations are undefined for none positive values, such as \code{log()} and \code{sqrt()}, the relevel variables are shifted (post standardization) by \code{Z - min(Z) + 1} or \code{Z - min(Z)} (respectively). } \examples{ model <- lm(Infant.Mortality ~ Education * Fertility, data = swiss) coef(standardize(model)) } \seealso{ Other standardize: \code{\link{standardize}()} } \concept{standardize} ================================================ FILE: man/text_format.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/text_format.R \name{text_format} \alias{text_format} \alias{text_fullstop} \alias{text_lastchar} \alias{text_concatenate} \alias{text_paste} \alias{text_remove} \alias{text_wrap} \title{Convenient text formatting functionalities} \usage{ text_format( text, sep = ", ", last = " and ", width = NULL, enclose = NULL, ... ) text_fullstop(text) text_lastchar(text, n = 1) text_concatenate(text, sep = ", ", last = " and ", enclose = NULL) text_paste(text, text2 = NULL, sep = ", ", enclose = NULL, ...) text_remove(text, pattern = "", ...) text_wrap(text, width = NULL, ...) } \arguments{ \item{text, text2}{A character string.} \item{sep}{Separator.} \item{last}{Last separator.} \item{width}{Positive integer giving the target column width for wrapping lines in the output. Can be "auto", in which case it will select 90\\% of the default width.} \item{enclose}{Character that will be used to wrap elements of \code{text}, so these can be, e.g., enclosed with quotes or backticks. If \code{NULL} (default), text elements will not be enclosed.} \item{...}{Other arguments to be passed to or from other functions.} \item{n}{The number of characters to find.} \item{pattern}{Regex pattern to remove from \code{text}.} } \value{ A character string. } \description{ Convenience functions to manipulate and format text. } \examples{ # Add full stop if missing text_fullstop(c("something", "something else.")) # Find last characters text_lastchar(c("ABC", "DEF"), n = 2) # Smart concatenation text_concatenate(c("First", "Second", "Last")) text_concatenate(c("First", "Second", "Last"), last = " or ", enclose = "`") # Remove parts of string text_remove(c("one!", "two", "three!"), "!") # Wrap text long_text <- paste(rep("abc ", 100), collapse = "") cat(text_wrap(long_text, width = 50)) # Paste with optional separator text_paste(c("A", "", "B"), c("42", "42", "42")) } ================================================ FILE: man/to_factor.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_factor.R \name{to_factor} \alias{to_factor} \alias{to_factor.numeric} \alias{to_factor.data.frame} \title{Convert data to factors} \usage{ to_factor(x, ...) \method{to_factor}{numeric}(x, labels_to_levels = TRUE, verbose = TRUE, ...) \method{to_factor}{data.frame}( x, select = NULL, exclude = NULL, ignore_case = FALSE, append = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A data frame or vector.} \item{...}{Arguments passed to or from other methods.} \item{labels_to_levels}{Logical, if \code{TRUE}, value labels are used as factor levels after \code{x} was converted to factor. Else, factor levels are based on the values of \code{x} (i.e. as if using \code{as.factor()}).} \item{verbose}{Toggle warnings.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} } \value{ A factor, or a data frame of factors. } \description{ Convert data to factors } \details{ Convert variables or data into factors. If the data is labelled, value labels will be used as factor levels. The counterpart to convert variables into numeric is \code{to_numeric()}. } \note{ Factors are ignored and returned as is. If you want to use value labels as levels for factors, use \code{\link[=labels_to_levels]{labels_to_levels()}} instead. } \section{Selection of variables - the \code{select} argument}{ For most functions that have a \code{select} argument (including this function), the complete input data frame is returned, even when \code{select} only selects a range of variables. That is, the function is only applied to those variables that have a match in \code{select}, while all other variables remain unchanged. In other words: for this function, \code{select} will not omit any non-included variables, so that the returned data frame will include all variables from the input data frame. } \examples{ str(to_factor(iris)) # use labels as levels data(efc) str(efc$c172code) head(to_factor(efc$c172code)) } ================================================ FILE: man/to_numeric.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/to_numeric.R \name{to_numeric} \alias{to_numeric} \alias{to_numeric.data.frame} \title{Convert data to numeric} \usage{ to_numeric(x, ...) \method{to_numeric}{data.frame}( x, select = NULL, exclude = NULL, dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE, ... ) } \arguments{ \item{x}{A data frame, factor or vector.} \item{...}{Arguments passed to or from other methods.} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), \item a string with the variable name (e.g., \code{"column_name"}), a character vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a character vector of variable names including ranges specified via \code{:} (e.g., \code{c("col1:col3", "col5")}), \item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can be a named character vector. In this case, the names are used to rename the columns in the output data frame. See 'Details' in the related functions to see where this option applies. \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), \item a vector of negative integers, giving the positions counting from the right (e.g., \code{-1} or \code{-1:-3}), \item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, \code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, \code{ends_with()}, and \code{contains()} accept several patterns, e.g \code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular expression patterns. \item a function testing for logical conditions, e.g. \code{is.numeric()} (or \code{is.numeric}), or any user-defined function that selects the variables for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), \item ranges specified via literal variable names, select-helpers (except \code{regex()}) and (user-defined) functions can be negated, i.e. return non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, \code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means that matches are \emph{excluded}, and thus, the \code{exclude} argument can be used alternatively. For instance, \code{select=-ends_with("Length")} (with \code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case negation should not work as expected, use the \code{exclude} argument instead. } If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} \item{exclude}{See \code{select}, however, column names matched by the pattern from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), excludes no columns.} \item{dummy_factors}{Transform factors to dummy factors (all factor levels as different columns filled with a binary 0-1 value).} \item{preserve_levels}{Logical, only applies if \code{x} is a factor. If \code{TRUE}, and \code{x} has numeric factor levels, these will be converted into the related numeric values. If this is not possible, the converted numeric values will start from 1 to number of levels.} \item{lowest}{Numeric, indicating the lowest (minimum) value when converting factors or character vectors to numeric values.} \item{append}{Logical or string. If \code{TRUE}, recoded or converted variables get new column names and are appended (column bind) to \code{x}, thus returning both the original and the recoded variables. The new columns get a suffix, based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for \code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for \code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be overwritten by their recoded versions. If a character value, recoded variables are appended with new column names (using the defined suffix) to the original data frame.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} \item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a character string (or a variable containing a character string) and is not allowed to be one of the supported select-helpers or a character vector of length > 1. \code{regex = TRUE} is comparable to using one of the two select-helpers, \code{select = contains()} or \code{select = regex()}, however, since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{verbose}{Toggle warnings.} } \value{ A data frame of numeric variables. } \description{ Convert data to numeric by converting characters to factors and factors to either numeric levels or dummy variables. The "counterpart" to convert variables into factors is \code{to_factor()}. } \note{ When factors should be converted into multiple "binary" dummies, i.e. each factor level is converted into a separate column filled with a binary 0-1 value, set \code{dummy_factors = TRUE}. If you want to preserve the original factor levels (in case these represent numeric values), use \code{preserve_levels = TRUE}. } \section{Selection of variables - \code{select} argument}{ For most functions that have a \code{select} argument the complete input data frame is returned, even when \code{select} only selects a range of variables. However, for \code{to_numeric()}, factors might be converted into dummies, thus, the number of variables of the returned data frame no longer match the input data frame. Hence, when \code{select} is used, \emph{only} those variables (or their dummies) specified in \code{select} will be returned. Use \code{append=TRUE} to also include the original variables in the returned data frame. } \examples{ to_numeric(head(ToothGrowth)) to_numeric(head(ToothGrowth), dummy_factors = TRUE) # factors x <- as.factor(mtcars$gear) to_numeric(x) to_numeric(x, preserve_levels = TRUE) # same as: coerce_to_numeric(x) } ================================================ FILE: man/visualisation_recipe.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/visualisation_recipe.R \name{visualisation_recipe} \alias{visualisation_recipe} \title{Prepare objects for visualisation} \usage{ visualisation_recipe(x, ...) } \arguments{ \item{x}{An \code{easystats} object.} \item{...}{Other arguments passed to other functions.} } \description{ This function prepares objects for visualisation by returning a list of layers with data and geoms that can be easily plotted using for instance \code{ggplot2}. If the \code{see} package is installed, the call to \code{visualization_recipe()} can be replaced by \code{plot()}, which will internally call the former and then plot it using \code{ggplot}. The resulting plot can be customized ad-hoc (by adding ggplot's geoms, theme or specifications), or via some of the arguments of \code{visualisation_recipe()} that control the aesthetic parameters. See the specific documentation page for your object's class: \itemize{ \item {modelbased}: \url{https://easystats.github.io/modelbased/reference/visualisation_recipe.estimate_predicted.html} \item {correlation}: \url{https://easystats.github.io/correlation/reference/visualisation_recipe.easycormatrix.html} } } ================================================ FILE: man/weighted_mean.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/weighted_mean_median_sd_mad.R \name{weighted_mean} \alias{weighted_mean} \alias{weighted_median} \alias{weighted_sd} \alias{weighted_mad} \title{Weighted Mean, Median, SD, and MAD} \usage{ weighted_mean(x, weights = NULL, remove_na = TRUE, verbose = TRUE, ...) weighted_median(x, weights = NULL, remove_na = TRUE, verbose = TRUE, ...) weighted_sd(x, weights = NULL, remove_na = TRUE, verbose = TRUE, ...) weighted_mad( x, weights = NULL, constant = 1.4826, remove_na = TRUE, verbose = TRUE, ... ) } \arguments{ \item{x}{an object containing the values whose weighted mean is to be computed.} \item{weights}{A numerical vector of weights the same length as \code{x} giving the weights to use for elements of \code{x}. If \code{weights = NULL}, \code{x} is passed to the non-weighted function.} \item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) and infinite values from \code{x} and \code{weights}.} \item{verbose}{Show warning when \code{weights} are negative?} \item{...}{arguments to be passed to or from methods.} \item{constant}{scale factor.} } \description{ Weighted Mean, Median, SD, and MAD } \examples{ ## GPA from Siegel 1994 x <- c(3.7, 3.3, 3.5, 2.8) wt <- c(5, 5, 4, 1) / 15 weighted_mean(x, wt) weighted_median(x, wt) weighted_sd(x, wt) weighted_mad(x, wt) } ================================================ FILE: man/winsorize.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/winsorize.R \name{winsorize} \alias{winsorize} \alias{winsorize.numeric} \title{Winsorize data} \usage{ winsorize(data, ...) \method{winsorize}{numeric}( data, threshold = 0.2, method = "percentile", robust = FALSE, verbose = TRUE, ... ) } \arguments{ \item{data}{data frame or vector.} \item{...}{Currently not used.} \item{threshold}{The amount of winsorization, depends on the value of \code{method}: \itemize{ \item For \code{method = "percentile"}: the amount to winsorize from \emph{each} tail. The value of \code{threshold} must be between 0 and 0.5 and of length 1. \item For \code{method = "zscore"}: the number of \emph{SD}/\emph{MAD}-deviations from the \emph{mean}/\emph{median} (see \code{robust}). The value of \code{threshold} must be greater than 0 and of length 1. \item For \code{method = "raw"}: a vector of length 2 with the lower and upper bound for winsorization. }} \item{method}{One of "percentile" (default), "zscore", or "raw".} \item{robust}{Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.} \item{verbose}{Not used anymore since \code{datawizard} 0.6.6.} } \value{ A data frame with winsorized columns or a winsorized vector. } \description{ Winsorize data } \details{ Winsorizing or winsorization is the transformation of statistics by limiting extreme values in the statistical data to reduce the effect of possibly spurious outliers. The distribution of many statistics can be heavily influenced by outliers. A typical strategy is to set all outliers (values beyond a certain threshold) to a specified percentile of the data; for example, a \verb{90\%} winsorization would see all data below the 5th percentile set to the 5th percentile, and data above the 95th percentile set to the 95th percentile. Winsorized estimators are usually more robust to outliers than their more standard forms. } \examples{ hist(iris$Sepal.Length, main = "Original data") hist(winsorize(iris$Sepal.Length, threshold = 0.2), xlim = c(4, 8), main = "Percentile Winsorization" ) hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), xlim = c(4, 8), main = "Mean (+/- SD) Winsorization" ) hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), xlim = c(4, 8), main = "Median (+/- MAD) Winsorization" ) hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), xlim = c(4, 8), main = "Raw Thresholds" ) # Also works on a data frame: winsorize(iris, threshold = 0.2) } \seealso{ \itemize{ \item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}} \item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}}, \code{\link[=data_remove]{data_remove()}} \item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}}, \code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}} \item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}}, \code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}} \item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}} \item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}} \item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}} \item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}} } } ================================================ FILE: paper/JOSS_files/apa.csl ================================================ ================================================ FILE: paper/JOSS_files/paper.Rmd ================================================ --- title: "datawizard: An R Package for Easy Data Preparation and Statistical Transformations" tags: - R - easystats authors: - affiliation: 1 name: Indrajeet Patil orcid: 0000-0003-1995-6531 - affiliation: 2 name: Dominique Makowski orcid: 0000-0001-5375-9967 - affiliation: 3 name: Mattan S. Ben-Shachar orcid: 0000-0002-4287-4801 - affiliation: 4 name: Brenton M. Wiernik^[Brenton Wiernik is currently an independent researcher and Research Scientist at Meta, Demography and Survey Science. The current work was done in an independent capacity.] orcid: 0000-0001-9560-6336 - affiliation: 5 name: Etienne Bacher orcid: 0000-0002-9271-5075 - affiliation: 6 name: Daniel Lüdecke orcid: 0000-0002-8895-3206 affiliations: - index: 1 name: cynkra Analytics GmbH, Germany - index: 2 name: Nanyang Technological University, Singapore - index: 3 name: Ben-Gurion University of the Negev, Israel - index: 4 name: Independent Researcher - index: 5 name: Luxembourg Institute of Socio-Economic Research (LISER), Luxembourg - index: 6 name: University Medical Center Hamburg-Eppendorf, Germany date: "`r Sys.Date()`" bibliography: paper.bib output: rticles::joss_article csl: apa.csl journal: JOSS link-citations: yes --- ```{r, warning=FALSE, message=FALSE, echo=FALSE} knitr::opts_chunk$set( collapse = TRUE, out.width = "100%", dpi = 300, comment = "#>", message = FALSE, warning = FALSE ) library(datawizard) set.seed(2016) ``` # Summary The `{datawizard}` package for the R programming language [@base2021] provides a lightweight toolbox to assist in key steps involved in any data analysis workflow: (1) wrangling the raw data to get it in the needed form, (2) applying preprocessing steps and statistical transformations, and (3) compute statistical summaries of data properties and distributions. Therefore, it can be a valuable tool for R users and developers looking for a lightweight option for data preparation. # Statement of Need The `{datawizard}` package is part of `{easystats}`, a collection of R packages designed to make statistical analysis easier (@Ben-Shachar2020, @Lüdecke2020parameters, @Lüdecke2020performance, @Lüdecke2021see, @Lüdecke2019, @Makowski2019, @Makowski2020). As this ecosystem follows a "0-external-hard-dependency" policy, a data manipulation package that relies only on base R needed to be created. In effect, `{datawizard}` provides a data processing backend for this entire ecosystem. In addition to its usefulness to the `{easystats}` ecosystem, it also provides *an* option for R users and package developers if they wish to keep their (recursive) dependency weight to a minimum (for other options, see @Dowle2021, @Eastwood2021). Because `{datawizard}` is also meant to be used and adopted easily by a wide range of users, its workflow and syntax are designed to be similar to `{tidyverse}` [@Wickham2019], a widely used ecosystem of R packages. Thus, users familiar with the `{tidyverse}` can easily translate their knowledge and make full use of `{datawizard}`. In addition to being a lightweight solution to clean messy data, `{datawizard}` also provides helpers for the other important step of data analysis: applying statistical transformations to the cleaned data while setting up statistical models. This includes various types of data standardization, normalization, rank-transformation, and adjustment. These transformations, although widely used, are not currently collectively implemented in a package in the R ecosystem, so `{datawizard}` can help new R users in finding the transformation they need. Lastly, `{datawizard}` also provides a toolbox to create detailed summaries of data properties and distributions (e.g., tables of descriptive statistics for each variable). This is a common step in data analysis, but it is not available in base R or many modeling packages, so its inclusion makes `{datawizard}` a one-stop-shop for data preparation tasks. # Features ## Data Preparation The raw data is rarely in a state that it can be directly fed into a statistical model. It often needs to be modified in various ways. For example, columns need to be renamed or reshaped, certain portions of the data need to be filtered out, data scattered across multiple tables needs to be joined, etc. `{datawizard}` provides various functions for cleaning and preparing data (see Table 1). | Function | Operation | | :--------------- | :------------------------------------ | | `data_filter()` | to select only certain *observations* | | `data_select()` | to select only certain *variables* | | `data_extract()` | to extract a single *variable* | | `data_rename()` | to rename variables | | `data_to_long()` | to convert data from wide to long | | `data_to_wide()` | to convert data from long to wide | | `data_join()` | to join two data frames | | ... | ... | Table: The table below lists a few key functions offered by `{datawizard}` for data wrangling. To see the full list, see the package website: We will look at one example function that converts data in wide format to tidy/long format: ```{r} stocks <- data.frame( time = as.Date("2009-01-01") + 0:4, X = rnorm(5, 0, 1), Y = rnorm(5, 0, 2) ) stocks data_to_long( stocks, select = -c("time"), names_to = "stock", values_to = "price" ) ``` ## Statistical Transformations Even after getting the raw data in the needed format, we may need to transform certain variables further to meet requirements imposed by a statistical test. `{datawizard}` provides a rich collection of such functions for transforming variables (see Table 2). | Function | Operation | | :---------------- | :------------------------------------------- | | `standardize()` | to center and scale data | | `normalize()` | to scale variables to 0-1 range | | `adjust()` | to adjust data for effect of other variables | | `slide()` | to shift numeric value range | | `ranktransform()` | to convert numeric values to integer ranks | | ... | ... | Table: The table below lists a few key functions offered by `{datawizard}` for data transformations. To see the full list, see the package website: We will look at one example function that standardizes (i.e. centers and scales) data so that it can be expressed in terms of standard deviation: ```{r} d <- data.frame( a = c(-2, -1, 0, 1, 2), b = c(3, 4, 5, 6, 7) ) standardize(d, center = c(3, 4), scale = c(2, 4)) ``` ## Summaries of Data Properties and Distributions The workhorse function to get a comprehensive summary of data properties is `describe_distribution()`, which combines a set of indices (e.g., measures of centrality, dispersion, range, skewness, kurtosis, etc.) computed by other functions in `{datawizard}`. ```{r eval=FALSE} describe_distribution(mtcars) ``` ```{r echo=FALSE, eval=TRUE, results="asis"} library(kableExtra) options(digits = 2) kbl(describe_distribution(mtcars), format = "latex", booktabs = TRUE, linesep = "") ``` # Licensing and Availability `{datawizard}` is licensed under the GNU General Public License (v3.0), with all source code openly developed and stored on GitHub (), along with a corresponding issue tracker for bug reporting and feature enhancements. In the spirit of honest and open science, we encourage requests, tips for fixes, feature updates, as well as general questions and concerns via direct interaction with contributors and developers. # Acknowledgments `{datawizard}` is part of the collaborative [*easystats*](https://easystats.github.io/easystats/) ecosystem. Thus, we thank the [members of easystats](https://github.com/orgs/easystats/people) as well as the users. # References ================================================ FILE: paper/JOSS_files/paper.bib ================================================ @Article{Ben-Shachar2020, title = {{e}ffectsize: Estimation of Effect Size Indices and Standardized Parameters}, author = {Mattan S. Ben-Shachar and Daniel Lüdecke and Dominique Makowski}, year = {2020}, journal = {Journal of Open Source Software}, volume = {5}, number = {56}, pages = {2815}, publisher = {The Open Journal}, doi = {10.21105/joss.02815}, url = {https://doi.org/10.21105/joss.02815}, } @Article{Lüdecke2020parameters, title = {Extracting, Computing and Exploring the Parameters of Statistical Models using {R}.}, volume = {5}, doi = {10.21105/joss.02445}, number = {53}, journal = {Journal of Open Source Software}, author = {Daniel Lüdecke and Mattan S. Ben-Shachar and Indrajeet Patil and Dominique Makowski}, year = {2020}, pages = {2445}, } @Article{Lüdecke2021see, title = {{see}: An {R} Package for Visualizing Statistical Models}, author = {Daniel Lüdecke and Indrajeet Patil and Mattan S. Ben-Shachar and Brenton M. Wiernik and Philip Waggoner and Dominique Makowski}, journal = {Journal of Open Source Software}, year = {2021}, volume = {6}, number = {64}, pages = {3393}, doi = {10.21105/joss.03393}, } @Article{Lüdecke2020performance, title = {{performance}: An {R} Package for Assessment, Comparison and Testing of Statistical Models}, author = {Daniel Lüdecke and Mattan S. Ben-Shachar and Indrajeet Patil and Philip Waggoner and Dominique Makowski}, year = {2021}, journal = {Journal of Open Source Software}, volume = {6}, number = {60}, pages = {3139}, doi = {10.21105/joss.03139}, } @Article{Lüdecke2019, title = {{insight}: A Unified Interface to Access Information from Model Objects in {R}.}, volume = {4}, doi = {10.21105/joss.01412}, number = {38}, journal = {Journal of Open Source Software}, author = {Daniel Lüdecke and Philip Waggoner and Dominique Makowski}, year = {2019}, pages = {1412}, } @Article{Makowski2020, title = {Methods and Algorithms for Correlation Analysis in {R}.}, author = {Dominique Makowski and Mattan S. Ben-Shachar and Indrajeet Patil and Daniel Lüdecke}, doi = {10.21105/joss.02306}, year = {2020}, journal = {Journal of Open Source Software}, number = {51}, volume = {5}, pages = {2306}, url = {https://joss.theoj.org/papers/10.21105/joss.02306}, } @Article{Patil2021, doi = {10.21105/joss.03167}, url = {https://doi.org/10.21105/joss.03167}, year = {2021}, publisher = {{The Open Journal}}, volume = {6}, number = {61}, pages = {3167}, author = {Indrajeet Patil}, title = {{Visualizations with statistical details: The {'ggstatsplot'} approach}}, journal = {{Journal of Open Source Software}}, } @Article{Makowski2019, title = {{bayestestR}: Describing Effects and their Uncertainty, Existence and Significance within the {B}ayesian Framework.}, author = {Dominique Makowski and Mattan S. Ben-Shachar and Daniel Lüdecke}, journal = {Journal of Open Source Software}, doi = {10.21105/joss.01541}, year = {2019}, number = {40}, volume = {4}, pages = {1541}, url = {https://joss.theoj.org/papers/10.21105/joss.01541}, } @Article{Wickham2019, title = {Welcome to the {tidyverse}}, author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani}, year = {2019}, journal = {Journal of Open Source Software}, volume = {4}, number = {43}, pages = {1686}, doi = {10.21105/joss.01686}, } @Article{Makowski2020modelbased, title = {Estimation of Model-Based Predictions, Contrasts and Means.}, author = {Dominique Makowski and Mattan S. Ben-Shachar and Indrajeet Patil and Daniel Lüdecke}, journal = {CRAN}, year = {2020}, url = {https://github.com/easystats/modelbased}, } @Manual{base2021, title = {{R}: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, organization = {R Foundation for Statistical Computing}, address = {Vienna, Austria}, year = {2021}, url = {https://www.R-project.org/}, } @Manual{Eastwood2021, title = {poorman: A Poor Man's Dependency Free Recreation of 'dplyr'}, author = {Nathan Eastwood}, year = {2021}, note = {R package version 0.2.5}, url = {https://CRAN.R-project.org/package=poorman}, } @Manual{Dowle2021, title = {data.table: Extension of `data.frame`}, author = {Matt Dowle and Arun Srinivasan}, year = {2021}, note = {R package version 1.14.2}, url = {https://CRAN.R-project.org/package=data.table}, } ================================================ FILE: paper/JOSS_files/paper.log ================================================ This is XeTeX, Version 3.141592653-2.6-0.999994 (TeX Live 2022) (preloaded format=xelatex 2022.9.27) 4 OCT 2022 17:54 entering extended mode restricted \write18 enabled. %&-line parsing enabled. **paper.tex (./paper.tex LaTeX2e <2022-06-01> patch level 5 L3 programming layer <2022-08-30> (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/article.cls Document Class: article 2021/10/04 v1.4n Standard LaTeX document class (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/size10.clo File: size10.clo 2021/10/04 v1.4n Standard LaTeX file (size option) ) \c@part=\count181 \c@section=\count182 \c@subsection=\count183 \c@subsubsection=\count184 \c@paragraph=\count185 \c@subparagraph=\count186 \c@figure=\count187 \c@table=\count188 \abovecaptionskip=\skip47 \belowcaptionskip=\skip48 \bibindent=\dimen138 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/marginnote/marginnote.sty Package: marginnote 2018/08/09 v1.4b non floating margin notes for LaTeX \c@mn@abspage=\count189 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/graphicx.sty Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/keyval.sty Package: keyval 2022/05/29 v1.15 key=value parser (DPC) \KV@toks@=\toks16 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/graphics.sty Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/trig.sty Package: trig 2021/08/11 v1.11 sin cos tan (DPC) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics-cfg/graphics.cfg File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration ) Package graphics Info: Driver file: xetex.def on input line 107. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics-def/xetex.def File: xetex.def 2022/09/22 v5.0n Graphics/color driver for xetex \stockwidth=\dimen139 \stockheight=\dimen140 )) \Gin@req@height=\dimen141 \Gin@req@width=\dimen142 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/xcolor/xcolor.sty Package: xcolor 2022/06/12 v2.14 LaTeX color extensions (UK) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics-cfg/color.cfg File: color.cfg 2016/01/02 v1.6 sample color configuration ) Package xcolor Info: Driver file: xetex.def on input line 227. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/mathcolor.ltx) Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1353. Package xcolor Info: Model `RGB' extended on input line 1369. Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1371. Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1372. Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1373. Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1374. Package xcolor Info: Model `Gray' substituted by `gray' on input line 1375. Package xcolor Info: Model `wave' substituted by `hsb' on input line 1376. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/preprint/authblk.sty Package: authblk 2001/02/27 1.3 (PWD) \affilsep=\skip49 \@affilsep=\skip50 \c@Maxaffil=\count190 \c@authors=\count191 \c@affil=\count192 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/etoolbox/etoolbox.sty Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW) \etb@tempcnta=\count193 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/titlesec/titlesec.sty Package: titlesec 2021/07/05 v2.14 Sectioning titles \ttl@box=\box51 \beforetitleunit=\skip51 \aftertitleunit=\skip52 \ttl@plus=\dimen143 \ttl@minus=\dimen144 \ttl@toksa=\toks17 \titlewidth=\dimen145 \titlewidthlast=\dimen146 \titlewidthfirst=\dimen147 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/calc.sty Package: calc 2017/05/25 v4.3 Infix arithmetic (KKT,FJ) \calc@Acount=\count194 \calc@Bcount=\count195 \calc@Adimen=\dimen148 \calc@Bdimen=\dimen149 \calc@Askip=\skip53 \calc@Bskip=\skip54 LaTeX Info: Redefining \setlength on input line 80. LaTeX Info: Redefining \addtolength on input line 81. \calc@Ccount=\count196 \calc@Cskip=\skip55 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex \pgfutil@everybye=\toks18 \pgfutil@tempdima=\dimen150 \pgfutil@tempdimb=\dimen151 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-common-lists.tex)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def \pgfutil@abb=\box52 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/pgf.revision.tex) Package: pgfrcs 2021/05/15 v3.1.9a (3.1.9a) )) Package: pgf 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex Package: pgfsys 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex \pgfkeys@pathtoks=\toks19 \pgfkeys@temptoks=\toks20 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeysfiltered.code.tex \pgfkeys@tmptoks=\toks21 )) \pgf@x=\dimen152 \pgf@y=\dimen153 \pgf@xa=\dimen154 \pgf@ya=\dimen155 \pgf@xb=\dimen156 \pgf@yb=\dimen157 \pgf@xc=\dimen158 \pgf@yc=\dimen159 \pgf@xd=\dimen160 \pgf@yd=\dimen161 \w@pgf@writea=\write3 \r@pgf@reada=\read2 \c@pgf@counta=\count197 \c@pgf@countb=\count198 \c@pgf@countc=\count199 \c@pgf@countd=\count266 \t@pgf@toka=\toks22 \t@pgf@tokb=\toks23 \t@pgf@tokc=\toks24 \pgf@sys@id@count=\count267 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg File: pgf.cfg 2021/05/15 v3.1.9a (3.1.9a) ) Driver file for pgf: pgfsys-xetex.def (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-xetex.def File: pgfsys-xetex.def 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-dvipdfmx.def File: pgfsys-dvipdfmx.def 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def File: pgfsys-common-pdf.def 2021/05/15 v3.1.9a (3.1.9a) ) \pgfsys@objnum=\count268 ))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex File: pgfsyssoftpath.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgfsyssoftpath@smallbuffer@items=\count269 \pgfsyssoftpath@bigbuffer@items=\count270 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex File: pgfsysprotocol.code.tex 2021/05/15 v3.1.9a (3.1.9a) )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex Package: pgfcore 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex \pgfmath@dimen=\dimen162 \pgfmath@count=\count271 \pgfmath@box=\box53 \pgfmath@toks=\toks25 \pgfmath@stack@operand=\toks26 \pgfmath@stack@operation=\toks27 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex \c@pgfmathroundto@lastzeros=\count272 )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex File: pgfcorepoints.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgf@picminx=\dimen163 \pgf@picmaxx=\dimen164 \pgf@picminy=\dimen165 \pgf@picmaxy=\dimen166 \pgf@pathminx=\dimen167 \pgf@pathmaxx=\dimen168 \pgf@pathminy=\dimen169 \pgf@pathmaxy=\dimen170 \pgf@xx=\dimen171 \pgf@xy=\dimen172 \pgf@yx=\dimen173 \pgf@yy=\dimen174 \pgf@zx=\dimen175 \pgf@zy=\dimen176 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex File: pgfcorepathconstruct.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgf@path@lastx=\dimen177 \pgf@path@lasty=\dimen178 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex File: pgfcorepathusage.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgf@shorten@end@additional=\dimen179 \pgf@shorten@start@additional=\dimen180 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex File: pgfcorescopes.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgfpic=\box54 \pgf@hbox=\box55 \pgf@layerbox@main=\box56 \pgf@picture@serial@count=\count273 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex File: pgfcoregraphicstate.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgflinewidth=\dimen181 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex File: pgfcoretransformations.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgf@pt@x=\dimen182 \pgf@pt@y=\dimen183 \pgf@pt@temp=\dimen184 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex File: pgfcorequick.code.tex 2021/05/15 v3.1.9a (3.1.9a) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex File: pgfcoreobjects.code.tex 2021/05/15 v3.1.9a (3.1.9a) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex File: pgfcorepathprocessing.code.tex 2021/05/15 v3.1.9a (3.1.9a) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex File: pgfcorearrows.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgfarrowsep=\dimen185 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex File: pgfcoreshade.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgf@max=\dimen186 \pgf@sys@shading@range@num=\count274 \pgf@shadingcount=\count275 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex File: pgfcoreimage.code.tex 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex File: pgfcoreexternal.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgfexternal@startupbox=\box57 )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex File: pgfcorelayers.code.tex 2021/05/15 v3.1.9a (3.1.9a) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex File: pgfcoretransparency.code.tex 2021/05/15 v3.1.9a (3.1.9a) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex File: pgfcorepatterns.code.tex 2021/05/15 v3.1.9a (3.1.9a) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex File: pgfcorerdf.code.tex 2021/05/15 v3.1.9a (3.1.9a) ))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex File: pgfmoduleshapes.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgfnodeparttextbox=\box58 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex File: pgfmoduleplot.code.tex 2021/05/15 v3.1.9a (3.1.9a) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty Package: pgfcomp-version-0-65 2021/05/15 v3.1.9a (3.1.9a) \pgf@nodesepstart=\dimen187 \pgf@nodesepend=\dimen188 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty Package: pgfcomp-version-1-18 2021/05/15 v3.1.9a (3.1.9a) )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgffor.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/math/pgfmath.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex Package: pgffor 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex) \pgffor@iter=\dimen189 \pgffor@skip=\dimen190 \pgffor@stack=\toks28 \pgffor@toks=\toks29 )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex Package: tikz 2021/05/15 v3.1.9a (3.1.9a) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex File: pgflibraryplothandlers.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgf@plot@mark@count=\count276 \pgfplotmarksize=\dimen191 ) \tikz@lastx=\dimen192 \tikz@lasty=\dimen193 \tikz@lastxsaved=\dimen194 \tikz@lastysaved=\dimen195 \tikz@lastmovetox=\dimen196 \tikz@lastmovetoy=\dimen197 \tikzleveldistance=\dimen198 \tikzsiblingdistance=\dimen199 \tikz@figbox=\box59 \tikz@figbox@bg=\box60 \tikz@tempbox=\box61 \tikz@tempbox@bg=\box62 \tikztreelevel=\count277 \tikznumberofchildren=\count278 \tikznumberofcurrentchild=\count279 \tikz@fig@count=\count280 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex File: pgfmodulematrix.code.tex 2021/05/15 v3.1.9a (3.1.9a) \pgfmatrixcurrentrow=\count281 \pgfmatrixcurrentcolumn=\count282 \pgf@matrix@numberofcolumns=\count283 ) \tikz@expandcount=\count284 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex File: tikzlibrarytopaths.code.tex 2021/05/15 v3.1.9a (3.1.9a) ))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/hyperref.sty Package: hyperref 2022-09-22 v7.00t Hypertext links for LaTeX (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/iftex.sty Package: iftex 2022/02/03 v1.0f TeX engine tests ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/infwarerr/infwarerr.sty Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO) ) Package pdftexcmds Info: \pdf@primitive is available. Package pdftexcmds Info: \pdf@ifprimitive is available. Package pdftexcmds Info: \pdfdraftmode not found. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/kvsetkeys/kvsetkeys.sty Package: kvsetkeys 2019/12/15 v1.18 Key value parser (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pdfescape/pdfescape.sty Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hycolor/hycolor.sty Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/auxhook/auxhook.sty Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/nameref.sty Package: nameref 2022-05-17 v2.50 Cross-referencing by name of section (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/refcount/refcount.sty Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/kvoptions/kvoptions.sty Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO) )) \c@section@level=\count285 ) \@linkdim=\dimen256 \Hy@linkcounter=\count286 \Hy@pagecounter=\count287 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/pd1enc.def File: pd1enc.def 2022-09-22 v7.00t Hyperref: PDFDocEncoding definition (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/intcalc/intcalc.sty Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/etexcmds/etexcmds.sty Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO) ) \Hy@SavedSpaceFactor=\count288 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/puenc.def File: puenc.def 2022-09-22 v7.00t Hyperref: PDF Unicode definition (HO) ) Package hyperref Info: Hyper figures OFF on input line 4162. Package hyperref Info: Link nesting OFF on input line 4167. Package hyperref Info: Hyper index ON on input line 4170. Package hyperref Info: Plain pages OFF on input line 4177. Package hyperref Info: Backreferencing OFF on input line 4182. Package hyperref Info: Implicit mode ON; LaTeX internals redefined. Package hyperref Info: Bookmarks ON on input line 4410. \c@Hy@tempcnt=\count289 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/url/url.sty \Urlmuskip=\muskip16 Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. ) LaTeX Info: Redefining \url on input line 4748. \XeTeXLinkMargin=\dimen257 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/bitset/bitset.sty Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO) )) \Fld@menulength=\count290 \Field@Width=\dimen258 \Fld@charsize=\dimen259 Package hyperref Info: Hyper figures OFF on input line 6027. Package hyperref Info: Link nesting OFF on input line 6032. Package hyperref Info: Hyper index ON on input line 6035. Package hyperref Info: backreferencing OFF on input line 6042. Package hyperref Info: Link coloring OFF on input line 6047. Package hyperref Info: Link coloring with OCG OFF on input line 6052. Package hyperref Info: PDF/A mode OFF on input line 6057. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/atbegshi-ltx.sty Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi package with kernel methods ) \Hy@abspage=\count291 \c@Item=\count292 \c@Hfootnote=\count293 ) Package hyperref Info: Driver (autodetected): hxetex. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/hxetex.def File: hxetex.def 2022-09-22 v7.00t Hyperref driver for XeTeX (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/stringenc/stringenc.sty Package: stringenc 2019/11/29 v1.12 Convert strings between diff. encodings (HO) ) \pdfm@box=\box63 \c@Hy@AnnotLevel=\count294 \HyField@AnnotCount=\count295 \Fld@listcount=\count296 \c@bookmark@seq@number=\count297 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/atveryend-ltx.sty Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend package with kernel methods ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO) ) Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 285. ) \Hy@SectionHShift=\skip56 ) Package hyperref Info: Option `colorlinks' set `true' on input line 12. Package hyperref Info: Option `breaklinks' set `true' on input line 12. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/caption/caption.sty Package: caption 2022/03/01 v3.6b Customizing captions (AR) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/caption/caption3.sty Package: caption3 2022/03/17 v2.3b caption3 kernel (AR) \caption@tempdima=\dimen260 \captionmargin=\dimen261 \caption@leftmargin=\dimen262 \caption@rightmargin=\dimen263 \caption@width=\dimen264 \caption@indent=\dimen265 \caption@parindent=\dimen266 \caption@hangindent=\dimen267 Package caption Info: Standard document class detected. ) \c@caption@flags=\count298 \c@continuedfloat=\count299 Package caption Info: hyperref package is loaded. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tcolorbox/tcolorbox.sty Package: tcolorbox 2022/06/24 version 5.1.1 text color boxes (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/verbatim.sty Package: verbatim 2020-07-07 v1.5u LaTeX2e package for verbatim enhancements \every@verbatim=\toks30 \verbatim@line=\toks31 \verbatim@in@stream=\read3 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/environ/environ.sty Package: environ 2014/05/04 v0.3 A new way to define environments (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/trimspaces/trimspaces.sty Package: trimspaces 2009/09/17 v1.1 Trim spaces around a token list ) \@envbody=\toks32 ) \tcb@titlebox=\box64 \tcb@upperbox=\box65 \tcb@lowerbox=\box66 \tcb@phantombox=\box67 \c@tcbbreakpart=\count300 \c@tcblayer=\count301 \c@tcolorbox@number=\count302 \tcb@temp=\box68 \tcb@temp=\box69 \tcb@temp=\box70 \tcb@temp=\box71 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/amssymb.sty Package: amssymb 2013/01/14 v3.01 AMS font symbols (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/amsfonts.sty Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support \@emptytoks=\toks33 \symAMSa=\mathgroup4 \symAMSb=\mathgroup5 LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' (Font) U/euf/m/n --> U/euf/b/n on input line 106. )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsmath.sty Package: amsmath 2022/04/08 v2.17n AMS math features \@mathmargin=\skip57 For additional information on amsmath, use the `?' option. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amstext.sty Package: amstext 2021/08/26 v2.01 AMS text (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsgen.sty File: amsgen.sty 1999/11/30 v2.0 generic functions \@emptytoks=\toks34 \ex@=\dimen268 )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsbsy.sty Package: amsbsy 1999/11/29 v1.2d Bold Symbols \pmbraise@=\dimen269 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsopn.sty Package: amsopn 2022/04/08 v2.04 operator names ) \inf@bad=\count303 LaTeX Info: Redefining \frac on input line 234. \uproot@=\count304 \leftroot@=\count305 LaTeX Info: Redefining \overline on input line 399. LaTeX Info: Redefining \colon on input line 410. \classnum@=\count306 \DOTSCASE@=\count307 LaTeX Info: Redefining \ldots on input line 496. LaTeX Info: Redefining \dots on input line 499. LaTeX Info: Redefining \cdots on input line 620. \Mathstrutbox@=\box72 \strutbox@=\box73 LaTeX Info: Redefining \big on input line 722. LaTeX Info: Redefining \Big on input line 723. LaTeX Info: Redefining \bigg on input line 724. LaTeX Info: Redefining \Bigg on input line 725. \big@size=\dimen270 LaTeX Font Info: Redeclaring font encoding OML on input line 743. LaTeX Font Info: Redeclaring font encoding OMS on input line 744. \macc@depth=\count308 LaTeX Info: Redefining \bmod on input line 905. LaTeX Info: Redefining \pmod on input line 910. LaTeX Info: Redefining \smash on input line 940. LaTeX Info: Redefining \relbar on input line 970. LaTeX Info: Redefining \Relbar on input line 971. \c@MaxMatrixCols=\count309 \dotsspace@=\muskip17 \c@parentequation=\count310 \dspbrk@lvl=\count311 \tag@help=\toks35 \row@=\count312 \column@=\count313 \maxfields@=\count314 \andhelp@=\toks36 \eqnshift@=\dimen271 \alignsep@=\dimen272 \tagshift@=\dimen273 \tagwidth@=\dimen274 \totwidth@=\dimen275 \lineht@=\dimen276 \@envbody=\toks37 \multlinegap=\skip58 \multlinetaggap=\skip59 \mathdisplay@stack=\toks38 LaTeX Info: Redefining \[ on input line 2953. LaTeX Info: Redefining \] on input line 2954. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifxetex.sty Package: ifxetex 2019/10/25 v0.7 ifxetex legacy package. Use iftex instead. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifluatex.sty Package: ifluatex 2019/10/25 v1.5 ifluatex legacy package. Use iftex instead. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/seqsplit/seqsplit.sty Package: seqsplit 2006/08/07 v0.1 Splitting long sequences (DNA, RNA, proteins, etc.) ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/fixltx2e.sty Package: fixltx2e 2016/12/29 v2.1a fixes to LaTeX (obsolete) Applying: [2015/01/01] Old fixltx2e package on input line 46. Package fixltx2e Warning: fixltx2e is not required with releases after 2015 (fixltx2e) All fixes are now in the LaTeX kernel. (fixltx2e) See the latexrelease package for details. Already applied: [0000/00/00] Old fixltx2e package on input line 53. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.sty Package: biblatex 2022/07/12 v3.18b programmable bibliographies (PK/MW) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/logreq/logreq.sty Package: logreq 2010/08/04 v1.0 xml request logger \lrq@indent=\count315 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/logreq/logreq.def File: logreq.def 2010/08/04 v1.0 logreq spec v1.0 )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/ifthen.sty Package: ifthen 2022/04/13 v1.1d Standard LaTeX ifthen package (DPC) ) \c@tabx@nest=\count316 \c@listtotal=\count317 \c@listcount=\count318 \c@liststart=\count319 \c@liststop=\count320 \c@citecount=\count321 \c@citetotal=\count322 \c@multicitecount=\count323 \c@multicitetotal=\count324 \c@instcount=\count325 \c@maxnames=\count326 \c@minnames=\count327 \c@maxitems=\count328 \c@minitems=\count329 \c@citecounter=\count330 \c@maxcitecounter=\count331 \c@savedcitecounter=\count332 \c@uniquelist=\count333 \c@uniquename=\count334 \c@refsection=\count335 \c@refsegment=\count336 \c@maxextratitle=\count337 \c@maxextratitleyear=\count338 \c@maxextraname=\count339 \c@maxextradate=\count340 \c@maxextraalpha=\count341 \c@abbrvpenalty=\count342 \c@highnamepenalty=\count343 \c@lownamepenalty=\count344 \c@maxparens=\count345 \c@parenlevel=\count346 \blx@tempcnta=\count347 \blx@tempcntb=\count348 \blx@tempcntc=\count349 \c@blx@maxsection=\count350 \blx@maxsegment@0=\count351 \blx@notetype=\count352 \blx@parenlevel@text=\count353 \blx@parenlevel@foot=\count354 \blx@sectionciteorder@0=\count355 \blx@sectionciteorderinternal@0=\count356 \blx@entrysetcounter=\count357 \blx@biblioinstance=\count358 \labelnumberwidth=\skip60 \labelalphawidth=\skip61 \biblabelsep=\skip62 \bibitemsep=\skip63 \bibnamesep=\skip64 \bibinitsep=\skip65 \bibparsep=\skip66 \bibhang=\skip67 \blx@bcfin=\read4 \blx@bcfout=\write4 \blx@langwohyphens=\language3 \c@mincomprange=\count359 \c@maxcomprange=\count360 \c@mincompwidth=\count361 Package biblatex Info: Trying to load biblatex default data model... Package biblatex Info: ... file 'blx-dm.def' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-dm.def File: blx-dm.def 2022/07/12 v3.18b biblatex localization (PK/MW) ) Package biblatex Info: Trying to load biblatex custom data model... Package biblatex Info: ... file 'biblatex-dm.cfg' not found. \c@afterword=\count362 \c@savedafterword=\count363 \c@annotator=\count364 \c@savedannotator=\count365 \c@author=\count366 \c@savedauthor=\count367 \c@bookauthor=\count368 \c@savedbookauthor=\count369 \c@commentator=\count370 \c@savedcommentator=\count371 \c@editor=\count372 \c@savededitor=\count373 \c@editora=\count374 \c@savededitora=\count375 \c@editorb=\count376 \c@savededitorb=\count377 \c@editorc=\count378 \c@savededitorc=\count379 \c@foreword=\count380 \c@savedforeword=\count381 \c@holder=\count382 \c@savedholder=\count383 \c@introduction=\count384 \c@savedintroduction=\count385 \c@namea=\count386 \c@savednamea=\count387 \c@nameb=\count388 \c@savednameb=\count389 \c@namec=\count390 \c@savednamec=\count391 \c@translator=\count392 \c@savedtranslator=\count393 \c@shortauthor=\count394 \c@savedshortauthor=\count395 \c@shorteditor=\count396 \c@savedshorteditor=\count397 \c@labelname=\count398 \c@savedlabelname=\count399 \c@institution=\count400 \c@savedinstitution=\count401 \c@lista=\count402 \c@savedlista=\count403 \c@listb=\count404 \c@savedlistb=\count405 \c@listc=\count406 \c@savedlistc=\count407 \c@listd=\count408 \c@savedlistd=\count409 \c@liste=\count410 \c@savedliste=\count411 \c@listf=\count412 \c@savedlistf=\count413 \c@location=\count414 \c@savedlocation=\count415 \c@organization=\count416 \c@savedorganization=\count417 \c@origlocation=\count418 \c@savedoriglocation=\count419 \c@origpublisher=\count420 \c@savedorigpublisher=\count421 \c@publisher=\count422 \c@savedpublisher=\count423 \c@language=\count424 \c@savedlanguage=\count425 \c@origlanguage=\count426 \c@savedoriglanguage=\count427 \c@pageref=\count428 \c@savedpageref=\count429 \shorthandwidth=\skip68 \shortjournalwidth=\skip69 \shortserieswidth=\skip70 \shorttitlewidth=\skip71 \shortauthorwidth=\skip72 \shorteditorwidth=\skip73 \locallabelnumberwidth=\skip74 \locallabelalphawidth=\skip75 \localshorthandwidth=\skip76 \localshortjournalwidth=\skip77 \localshortserieswidth=\skip78 \localshorttitlewidth=\skip79 \localshortauthorwidth=\skip80 \localshorteditorwidth=\skip81 Package biblatex Info: Trying to load enhanced support for Unicode engines... Package biblatex Info: ... file 'blx-unicode.def' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-unicode.def) Package biblatex Info: Trying to load compatibility code... Package biblatex Info: ... file 'blx-compat.def' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-compat.def File: blx-compat.def 2022/07/12 v3.18b biblatex compatibility (PK/MW) ) Package biblatex Info: Trying to load generic definitions... Package biblatex Info: ... file 'biblatex.def' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.def File: biblatex.def 2022/07/12 v3.18b biblatex compatibility (PK/MW) \c@textcitecount=\count430 \c@textcitetotal=\count431 \c@textcitemaxnames=\count432 \c@biburlbigbreakpenalty=\count433 \c@biburlbreakpenalty=\count434 \c@biburlnumpenalty=\count435 \c@biburlucpenalty=\count436 \c@biburllcpenalty=\count437 \biburlbigskip=\muskip18 \biburlnumskip=\muskip19 \biburlucskip=\muskip20 \biburllcskip=\muskip21 \c@smartand=\count438 ) Package biblatex Info: Trying to load bibliography style 'numeric'... Package biblatex Info: ... file 'numeric.bbx' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/bbx/numeric.bbx File: numeric.bbx 2022/07/12 v3.18b biblatex bibliography style (PK/MW) Package biblatex Info: Trying to load bibliography style 'standard'... Package biblatex Info: ... file 'standard.bbx' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/bbx/standard.bbx File: standard.bbx 2022/07/12 v3.18b biblatex bibliography style (PK/MW) \c@bbx:relatedcount=\count439 \c@bbx:relatedtotal=\count440 )) Package biblatex Info: Trying to load citation style 'numeric'... Package biblatex Info: ... file 'numeric.cbx' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/cbx/numeric.cbx File: numeric.cbx 2022/07/12 v3.18b biblatex citation style (PK/MW) Package biblatex Info: Redefining '\cite'. Package biblatex Info: Redefining '\parencite'. Package biblatex Info: Redefining '\footcite'. Package biblatex Info: Redefining '\footcitetext'. Package biblatex Info: Redefining '\smartcite'. Package biblatex Info: Redefining '\supercite'. Package biblatex Info: Redefining '\textcite'. Package biblatex Info: Redefining '\textcites'. Package biblatex Info: Redefining '\cites'. Package biblatex Info: Redefining '\parencites'. Package biblatex Info: Redefining '\smartcites'. ) Package biblatex Info: Trying to load configuration file... Package biblatex Info: ... file 'biblatex.cfg' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.cfg File: biblatex.cfg ) Package biblatex Info: XeTeX detected. (biblatex) Assuming input encoding 'utf8'. Package biblatex Info: Document encoding is UTF8 .... (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/l3kernel/expl3.sty Package: expl3 2022-08-30 L3 programming layer (loader) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/l3backend/l3backend-xetex.def File: l3backend-xetex.def 2022-08-30 L3 backend support: XeTeX \g__graphics_track_int=\count441 \l__pdf_internal_box=\box74 \g__pdf_backend_object_int=\count442 \g__pdf_backend_annotation_int=\count443 \g__pdf_backend_link_int=\count444 )) Package biblatex Info: ... and expl3 (biblatex) 2022-08-30 L3 programming layer (loader) (biblatex) is new enough (at least 2020/04/06), (biblatex) setting 'casechanger=expl3'. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-case-expl3.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/l3packages/xparse/xparse.sty Package: xparse 2022-06-22 L3 Experimental document command parser ) Package: blx-case-expl3 2022/07/12 v3.18b expl3 case changing code for biblatex )) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/geometry/geometry.sty Package: geometry 2020/01/02 v5.9 Page Geometry (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifvtex.sty Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead. ) \Gm@cnth=\count445 \Gm@cntv=\count446 \c@Gm@tempcnt=\count447 \Gm@bindingoffset=\dimen277 \Gm@wd@mp=\dimen278 \Gm@odd@mp=\dimen279 \Gm@even@mp=\dimen280 \Gm@layoutwidth=\dimen281 \Gm@layoutheight=\dimen282 \Gm@layouthoffset=\dimen283 \Gm@layoutvoffset=\dimen284 \Gm@dimlist=\toks39 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fancyhdr/fancyhdr.sty Package: fancyhdr 2022/05/18 v4.0.3 Extensive control of page headers and footers \f@nch@headwidth=\skip82 \f@nch@O@elh=\skip83 \f@nch@O@erh=\skip84 \f@nch@O@olh=\skip85 \f@nch@O@orh=\skip86 \f@nch@O@elf=\skip87 \f@nch@O@erf=\skip88 \f@nch@O@olf=\skip89 \f@nch@O@orf=\skip90 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/xelatex/mathspec/mathspec.sty Package: mathspec 2016/12/22 v0.2b LaTeX Package (Mathematics font selection for XeLaTeX) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec.sty Package: fontspec 2022/01/15 v2.8a Font selection for XeLaTeX and LuaLaTeX (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty Package: fontspec-xetex 2022/01/15 v2.8a Font selection for XeLaTeX and LuaLaTeX \l__fontspec_script_int=\count448 \l__fontspec_language_int=\count449 \l__fontspec_strnum_int=\count450 \l__fontspec_tmp_int=\count451 \l__fontspec_tmpa_int=\count452 \l__fontspec_tmpb_int=\count453 \l__fontspec_tmpc_int=\count454 \l__fontspec_em_int=\count455 \l__fontspec_emdef_int=\count456 \l__fontspec_strong_int=\count457 \l__fontspec_strongdef_int=\count458 \l__fontspec_tmpa_dim=\dimen285 \l__fontspec_tmpb_dim=\dimen286 \l__fontspec_tmpc_dim=\dimen287 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/fontenc.sty Package: fontenc 2021/04/29 v2.0v Standard LaTeX package ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/xkeyval/xkeyval.sty Package: xkeyval 2022/06/16 v2.9 package option processing (HA) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/xkeyval/xkeyval.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/xkeyval/xkvutils.tex \XKV@toks=\toks40 \XKV@tempa@toks=\toks41 ) \XKV@depth=\count459 File: xkeyval.tex 2014/12/03 v2.7a key=value parser (HA) )) \c@eu@=\count460 \c@eu@i=\count461 \c@mkern=\count462 ) Package hyperref Info: Option `unicode' set `true' on input line 151. Package hyperref Info: Option `breaklinks' set `true' on input line 151. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/grffile/grffile.sty Package: grffile 2019/11/11 v2.1 Extended file name support for graphics (legacy) Package grffile Info: This package is an empty stub for compatibility on input line 40. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fancyvrb/fancyvrb.sty Package: fancyvrb 2022/06/06 4.5 verbatim text (tvz,hv) \FV@CodeLineNo=\count463 \FV@InFile=\read5 \FV@TabBox=\box75 \c@FancyVerbLine=\count464 \FV@StepNumber=\count465 \FV@OutFile=\write5 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/framed/framed.sty Package: framed 2011/10/22 v 0.96: framed or shaded text with page breaks \OuterFrameSep=\skip91 \fb@frw=\dimen288 \fb@frh=\dimen289 \FrameRule=\dimen290 \FrameSep=\dimen291 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/longtable.sty Package: longtable 2021-09-01 v4.17 Multi-page Table package (DPC) \LTleft=\skip92 \LTright=\skip93 \LTpre=\skip94 \LTpost=\skip95 \LTchunksize=\count466 \LTcapwidth=\dimen292 \LT@head=\box76 \LT@firsthead=\box77 \LT@foot=\box78 \LT@lastfoot=\box79 \LT@gbox=\box80 \LT@cols=\count467 \LT@rows=\count468 \c@LT@tables=\count469 \c@LT@chunks=\count470 \LT@p@ftn=\toks42 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/booktabs/booktabs.sty Package: booktabs 2020/01/12 v1.61803398 Publication quality tables \heavyrulewidth=\dimen293 \lightrulewidth=\dimen294 \cmidrulewidth=\dimen295 \belowrulesep=\dimen296 \belowbottomsep=\dimen297 \aboverulesep=\dimen298 \abovetopsep=\dimen299 \cmidrulesep=\dimen300 \cmidrulekern=\dimen301 \defaultaddspace=\dimen302 \@cmidla=\count471 \@cmidlb=\count472 \@aboverulesep=\dimen303 \@belowrulesep=\dimen304 \@thisruleclass=\count473 \@lastruleclass=\count474 \@thisrulewidth=\dimen305 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/array.sty Package: array 2022/03/10 v2.5f Tabular extension package (FMi) \col@sep=\dimen306 \ar@mcellbox=\box81 \extrarowheight=\dimen307 \NC@list=\toks43 \extratabsurround=\skip96 \backup@length=\skip97 \ar@cellbox=\box82 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/mdwtools/footnote.sty Package: footnote 1997/01/28 1.13 Save footnotes around boxes \fn@notes=\box83 \fn@width=\dimen308 ) \cslhangindent=\skip98 \csllabelwidth=\skip99 \cslentryspacingunit=\skip100 (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/multirow/multirow.sty Package: multirow 2021/03/15 v2.8 Span multiple rows of a table \multirow@colwidth=\skip101 \multirow@cntb=\count475 \multirow@dima=\skip102 \bigstrutjot=\dimen309 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/wrapfig/wrapfig.sty \wrapoverhang=\dimen310 \WF@size=\dimen311 \c@WF@wrappedlines=\count476 \WF@box=\box84 \WF@everypar=\toks44 Package: wrapfig 2003/01/31 v 3.6 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/float/float.sty Package: float 2001/11/08 v1.3d Float enhancements (AL) \c@float@type=\count477 \float@exts=\toks45 \float@box=\box85 \@float@everytoks=\toks46 \@floatcapt=\box86 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/colortbl/colortbl.sty Package: colortbl 2022/06/20 v1.0f Color table columns (DPC) \everycr=\toks47 \minrowclearance=\skip103 \rownum=\count478 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pdflscape/pdflscape.sty Package: pdflscape 2019/12/05 v0.12 Display of landscape pages in PDF (HO) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/lscape.sty Package: lscape 2020/05/28 v3.02 Landscape Pages (DPC) ) Package pdflscape Info: Auto-detected driver: dvipdfm (xetex) on input line 98. ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tabu/tabu.sty Package: tabu 2019/01/11 v2.9 - flexible LaTeX tabulars (FC+tabu-fixed) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/varwidth/varwidth.sty Package: varwidth 2009/03/30 ver 0.92; Variable-width minipages \@vwid@box=\box87 \sift@deathcycles=\count479 \@vwid@loff=\dimen312 \@vwid@roff=\dimen313 ) \c@taburow=\count480 \tabu@nbcols=\count481 \tabu@cnt=\count482 \tabu@Xcol=\count483 \tabu@alloc=\count484 \tabu@nested=\count485 \tabu@target=\dimen314 \tabu@spreadtarget=\dimen315 \tabu@naturalX=\dimen316 \tabucolX=\dimen317 \tabu@Xsum=\dimen318 \extrarowdepth=\dimen319 \abovetabulinesep=\dimen320 \belowtabulinesep=\dimen321 \tabustrutrule=\dimen322 \tabu@thebody=\toks48 \tabu@footnotes=\toks49 \tabu@box=\box88 \tabu@arstrutbox=\box89 \tabu@hleads=\box90 \tabu@vleads=\box91 \tabu@cellskip=\skip104 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/threeparttable/threeparttable.sty Package: threeparttable 2003/06/13 v 3.0 \@tempboxb=\box92 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/threeparttablex/threeparttablex.sty Package: threeparttablex 2013/07/23 v0.3 by daleif \TPTL@width=\skip105 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/ulem/ulem.sty \UL@box=\box93 \UL@hyphenbox=\box94 \UL@skip=\skip106 \UL@hook=\toks50 \UL@height=\dimen323 \UL@pe=\count486 \UL@pixel=\dimen324 \ULC@box=\box95 Package: ulem 2019/11/18 \ULdepth=\dimen325 ) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/makecell/makecell.sty Package: makecell 2009/08/03 V0.1e Managing of Tab Column Heads and Cells \rotheadsize=\dimen326 \c@nlinenum=\count487 \TeXr@lab=\toks51 ) \@quotelevel=\count488 \@quotereset=\count489 (./paper.aux) \openout1 = `paper.aux'. LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 305. LaTeX Font Info: Trying to load font information for TS1+cmr on input line 305. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/ts1cmr.fd File: ts1cmr.fd 2019/12/16 v2.5j Standard LaTeX font definitions ) LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 305. LaTeX Font Info: ... okay on input line 305. Package hyperref Info: Link coloring ON on input line 305. (./paper.out) (./paper.out) \@outlinefile=\write6 \openout6 = `paper.out'. Package caption Info: Begin \AtBeginDocument code. Package caption Info: float package is loaded. Package caption Info: longtable package is loaded. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/caption/ltcaption.sty Package: ltcaption 2021/01/08 v1.4c longtable captions (AR) ) Package caption Info: threeparttable package is loaded. Package caption Info: wrapfig package is loaded. Package caption Info: End \AtBeginDocument code. Package biblatex Info: Trying to load language 'english'... Package biblatex Info: ... file 'english.lbx' found. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/lbx/english.lbx File: english.lbx 2022/07/12 v3.18b biblatex localization (PK/MW) ) Package biblatex Info: XeTeX detected. (biblatex) Assuming input encoding 'utf8'. Package biblatex Info: Automatic encoding selection. (biblatex) Assuming data encoding 'utf8'. \openout4 = `paper.bcf'. Package biblatex Info: Trying to load bibliographic data... Package biblatex Info: ... file 'paper.bbl' not found. No file paper.bbl. Package biblatex Info: Reference section=0 on input line 305. Package biblatex Info: Reference segment=0 on input line 305. *geometry* driver: auto-detecting *geometry* detected driver: xetex *geometry* verbose mode - [ preamble ] result: * driver: xetex * paper: a4paper * layout: * layoutoffset:(h,v)=(0.0pt,0.0pt) * modes: includemp * h-part:(L,W,R)=(28.45274pt, 526.376pt, 42.67912pt) * v-part:(T,H,B)=(99.58464pt, 660.10394pt, 85.35826pt) * \paperwidth=597.50787pt * \paperheight=845.04684pt * \textwidth=387.33861pt * \textheight=660.10394pt * \oddsidemargin=95.22015pt * \evensidemargin=95.22015pt * \topmargin=-60.28131pt * \headheight=62.59596pt * \headsep=25.0pt * \topskip=10.0pt * \footskip=30.0pt * \marginparwidth=128.0374pt * \marginparsep=11.0pt * \columnsep=10.0pt * \skip\footins=9.0pt plus 4.0pt minus 2.0pt * \hoffset=0.0pt * \voffset=0.0pt * \mag=1000 * \@twocolumnfalse * \@twosidefalse * \@mparswitchfalse * \@reversemargintrue * (1in=72.27pt=25.4mm, 1cm=28.453pt) LaTeX Font Info: Trying to load font information for U+msa on input line 306. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/umsa.fd File: umsa.fd 2013/01/14 v3.01 AMS symbols A ) LaTeX Font Info: Trying to load font information for U+msb on input line 306. (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/umsb.fd File: umsb.fd 2013/01/14 v3.01 AMS symbols B ) Package hyperref Warning: Suppressing link with empty target on input line 332. Package hyperref Warning: Suppressing link with empty target on input line 332. Package hyperref Warning: Suppressing link with empty target on input line 332. File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp) Package fancyhdr Warning: \headheight is too small (62.59596pt): (fancyhdr) Make it at least 63.55022pt, for example: (fancyhdr) \setlength{\headheight}{63.55022pt}. (fancyhdr) You might also make \topmargin smaller to compensate: (fancyhdr) \addtolength{\topmargin}{-0.95425pt}. LaTeX Font Info: Font shape `TU/lmss/m/it' in size <8> not available (Font) Font shape `TU/lmss/m/sl' tried instead on input line 393. [1 ] File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp) Package fancyhdr Warning: \headheight is too small (62.59596pt): (fancyhdr) Make it at least 63.55022pt, for example: (fancyhdr) \setlength{\headheight}{63.55022pt}. (fancyhdr) You might also make \topmargin smaller to compensate: (fancyhdr) \addtolength{\topmargin}{-0.95425pt}. [2] Underfull \hbox (badness 1448) in paragraph at lines 529--533 \TU/lmr/m/n/10 The workhorse function to get a comprehensive summary of data properties is [] File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp) Package fancyhdr Warning: \headheight is too small (62.59596pt): (fancyhdr) Make it at least 63.55022pt, for example: (fancyhdr) \setlength{\headheight}{63.55022pt}. (fancyhdr) You might also make \topmargin smaller to compensate: (fancyhdr) \addtolength{\topmargin}{-0.95425pt}. [3] Overfull \hbox (25.8514pt too wide) in paragraph at lines 540--557 [][] [] File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp) Package fancyhdr Warning: \headheight is too small (62.59596pt): (fancyhdr) Make it at least 63.55022pt, for example: (fancyhdr) \setlength{\headheight}{63.55022pt}. (fancyhdr) You might also make \topmargin smaller to compensate: (fancyhdr) \addtolength{\topmargin}{-0.95425pt}. [4] File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp) Package fancyhdr Warning: \headheight is too small (62.59596pt): (fancyhdr) Make it at least 63.55022pt, for example: (fancyhdr) \setlength{\headheight}{63.55022pt}. (fancyhdr) You might also make \topmargin smaller to compensate: (fancyhdr) \addtolength{\topmargin}{-0.95425pt}. [5] (./paper.aux) Package rerunfilecheck Info: File `paper.out' has not changed. (rerunfilecheck) Checksum: BFFBA38CB5FAA9461C3119C411944BB9;1328. Package logreq Info: Writing requests to 'paper.run.xml'. \openout1 = `paper.run.xml'. ) Here is how much of TeX's memory you used: 36661 strings out of 477747 751496 string characters out of 5842604 1543570 words of memory out of 5000000 57125 multiletter control sequences out of 15000+600000 477790 words of font info for 84 fonts, out of 8000000 for 9000 14 hyphenation exceptions out of 8191 84i,13n,81p,678b,848s stack positions out of 10000i,1000n,20000p,200000b,200000s Output written on paper.pdf (5 pages). ================================================ FILE: paper/JOSS_files/paper.md ================================================ --- title: "datawizard: An R Package for Easy Data Preparation and Statistical Transformations" tags: - R - easystats authors: - affiliation: 1 name: Indrajeet Patil orcid: 0000-0003-1995-6531 - affiliation: 2 name: Dominique Makowski orcid: 0000-0001-5375-9967 - affiliation: 3 name: Mattan S. Ben-Shachar orcid: 0000-0002-4287-4801 - affiliation: 4 name: Brenton M. Wiernik^[Brenton Wiernik is currently an independent researcher and Research Scientist at Meta, Demography and Survey Science. The current work was done in an independent capacity.] orcid: 0000-0001-9560-6336 - affiliation: 5 name: Etienne Bacher orcid: 0000-0002-9271-5075 - affiliation: 6 name: Daniel Lüdecke orcid: 0000-0002-8895-3206 affiliations: - index: 1 name: cynkra Analytics GmbH, Germany - index: 2 name: Nanyang Technological University, Singapore - index: 3 name: Ben-Gurion University of the Negev, Israel - index: 4 name: Independent Researcher - index: 5 name: Luxembourg Institute of Socio-Economic Research (LISER), Luxembourg - index: 6 name: University Medical Center Hamburg-Eppendorf, Germany date: "2022-10-04" bibliography: paper.bib output: rticles::joss_article csl: apa.csl journal: JOSS link-citations: yes --- # Summary The `{datawizard}` package for the R programming language [@base2021] provides a lightweight toolbox to assist in key steps involved in any data analysis workflow: (1) wrangling the raw data to get it in the needed form, (2) applying preprocessing steps and statistical transformations, and (3) compute statistical summaries of data properties and distributions. Therefore, it can be a valuable tool for R users and developers looking for a lightweight option for data preparation. # Statement of Need The `{datawizard}` package is part of `{easystats}`, a collection of R packages designed to make statistical analysis easier (@Ben-Shachar2020, @Lüdecke2020parameters, @Lüdecke2020performance, @Lüdecke2021see, @Lüdecke2019, @Makowski2019, @Makowski2020). As this ecosystem follows a "0-external-hard-dependency" policy, a data manipulation package that relies only on base R needed to be created. In effect, `{datawizard}` provides a data processing backend for this entire ecosystem. In addition to its usefulness to the `{easystats}` ecosystem, it also provides *an* option for R users and package developers if they wish to keep their (recursive) dependency weight to a minimum (for other options, see @Dowle2021, @Eastwood2021). Because `{datawizard}` is also meant to be used and adopted easily by a wide range of users, its workflow and syntax are designed to be similar to `{tidyverse}` [@Wickham2019], a widely used ecosystem of R packages. Thus, users familiar with the `{tidyverse}` can easily translate their knowledge and make full use of `{datawizard}`. In addition to being a lightweight solution to clean messy data, `{datawizard}` also provides helpers for the other important step of data analysis: applying statistical transformations to the cleaned data while setting up statistical models. This includes various types of data standardization, normalization, rank-transformation, and adjustment. These transformations, although widely used, are not currently collectively implemented in a package in the R ecosystem, so `{datawizard}` can help new R users in finding the transformation they need. Lastly, `{datawizard}` also provides a toolbox to create detailed summaries of data properties and distributions (e.g., tables of descriptive statistics for each variable). This is a common step in data analysis, but it is not available in base R or many modeling packages, so its inclusion makes `{datawizard}` a one-stop-shop for data preparation tasks. # Features ## Data Preparation The raw data is rarely in a state that it can be directly fed into a statistical model. It often needs to be modified in various ways. For example, columns need to be renamed or reshaped, certain portions of the data need to be filtered out, data scattered across multiple tables needs to be joined, etc. `{datawizard}` provides various functions for cleaning and preparing data (see Table 1). | Function | Operation | | :--------------- | :------------------------------------ | | `data_filter()` | to select only certain *observations* | | `data_select()` | to select only certain *variables* | | `data_extract()` | to extract a single *variable* | | `data_rename()` | to rename variables | | `data_to_long()` | to convert data from wide to long | | `data_to_wide()` | to convert data from long to wide | | `data_join()` | to join two data frames | | ... | ... | Table: The table below lists a few key functions offered by `{datawizard}` for data wrangling. To see the full list, see the package website: We will look at one example function that converts data in wide format to tidy/long format: ```r stocks <- data.frame( time = as.Date("2009-01-01") + 0:4, X = rnorm(5, 0, 1), Y = rnorm(5, 0, 2) ) stocks #> time X Y #> 1 2009-01-01 -0.91474184 -0.5654808 #> 2 2009-01-02 1.00124785 -1.5270177 #> 3 2009-01-03 -0.05642291 -1.3700199 #> 4 2009-01-04 0.29664516 0.7341479 #> 5 2009-01-05 -2.79147086 0.3659937 data_to_long( stocks, select = -c("time"), names_to = "stock", values_to = "price" ) #> time stock price #> 1 2009-01-01 X -0.91474184 #> 2 2009-01-01 Y -0.56548082 #> 3 2009-01-02 X 1.00124785 #> 4 2009-01-02 Y -1.52701766 #> 5 2009-01-03 X -0.05642291 #> 6 2009-01-03 Y -1.37001987 #> 7 2009-01-04 X 0.29664516 #> 8 2009-01-04 Y 0.73414790 #> 9 2009-01-05 X -2.79147086 #> 10 2009-01-05 Y 0.36599370 ``` ## Statistical Transformations Even after getting the raw data in the needed format, we may need to transform certain variables further to meet requirements imposed by a statistical test. `{datawizard}` provides a rich collection of such functions for transforming variables (see Table 2). | Function | Operation | | :---------------- | :------------------------------------------- | | `standardize()` | to center and scale data | | `normalize()` | to scale variables to 0-1 range | | `adjust()` | to adjust data for effect of other variables | | `slide()` | to shift numeric value range | | `ranktransform()` | to convert numeric values to integer ranks | | ... | ... | Table: The table below lists a few key functions offered by `{datawizard}` for data transformations. To see the full list, see the package website: We will look at one example function that standardizes (i.e. centers and scales) data so that it can be expressed in terms of standard deviation: ```r d <- data.frame( a = c(-2, -1, 0, 1, 2), b = c(3, 4, 5, 6, 7) ) standardize(d, center = c(3, 4), scale = c(2, 4)) #> a b #> 1 -2.5 -0.25 #> 2 -2.0 0.00 #> 3 -1.5 0.25 #> 4 -1.0 0.50 #> 5 -0.5 0.75 ``` ## Summaries of Data Properties and Distributions The workhorse function to get a comprehensive summary of data properties is `describe_distribution()`, which combines a set of indices (e.g., measures of centrality, dispersion, range, skewness, kurtosis, etc.) computed by other functions in `{datawizard}`. ```r describe_distribution(mtcars) ``` \begin{tabular}[t]{lrrrrrrrrr} \toprule Variable & Mean & SD & IQR & Min & Max & Skewness & Kurtosis & n & n\_Missing\\ \midrule mpg & 20.09 & 6.03 & 7.53 & 10.4 & 33.9 & 0.67 & -0.02 & 32 & 0\\ cyl & 6.19 & 1.79 & 4.00 & 4.0 & 8.0 & -0.19 & -1.76 & 32 & 0\\ disp & 230.72 & 123.94 & 221.52 & 71.1 & 472.0 & 0.42 & -1.07 & 32 & 0\\ hp & 146.69 & 68.56 & 84.50 & 52.0 & 335.0 & 0.80 & 0.28 & 32 & 0\\ drat & 3.60 & 0.53 & 0.84 & 2.8 & 4.9 & 0.29 & -0.45 & 32 & 0\\ wt & 3.22 & 0.98 & 1.19 & 1.5 & 5.4 & 0.47 & 0.42 & 32 & 0\\ qsec & 17.85 & 1.79 & 2.02 & 14.5 & 22.9 & 0.41 & 0.86 & 32 & 0\\ vs & 0.44 & 0.50 & 1.00 & 0.0 & 1.0 & 0.26 & -2.06 & 32 & 0\\ am & 0.41 & 0.50 & 1.00 & 0.0 & 1.0 & 0.40 & -1.97 & 32 & 0\\ gear & 3.69 & 0.74 & 1.00 & 3.0 & 5.0 & 0.58 & -0.90 & 32 & 0\\ carb & 2.81 & 1.62 & 2.00 & 1.0 & 8.0 & 1.16 & 2.02 & 32 & 0\\ \bottomrule \end{tabular} # Licensing and Availability `{datawizard}` is licensed under the GNU General Public License (v3.0), with all source code openly developed and stored on GitHub (), along with a corresponding issue tracker for bug reporting and feature enhancements. In the spirit of honest and open science, we encourage requests, tips for fixes, feature updates, as well as general questions and concerns via direct interaction with contributors and developers. # Acknowledgments `{datawizard}` is part of the collaborative [*easystats*](https://easystats.github.io/easystats/) ecosystem. Thus, we thank the [members of easystats](https://github.com/orgs/easystats/people) as well as the users. # References ================================================ FILE: pkgdown/_pkgdown.yaml ================================================ url: https://easystats.github.io/datawizard/ template: bootstrap: 5 package: easystatstemplate reference: - title: Data Preparation desc: | Main functions for cleaning and preparing data contents: - data_to_long - data_to_wide - data_extract - data_filter - data_select - data_reorder - data_arrange - data_merge - data_partition - data_rotate - data_group - data_replicate - data_duplicated - data_unique - title: Data and Variable Transformations - subtitle: Statistical Transformations desc: | Functions for transforming variables contents: - data_modify - data_separate - data_unite - categorize - recode_into - recode_values - adjust - demean - ranktransform - rescale_weights - winsorize - subtitle: "Linear Transformers" desc: | Convenient functions for common linear transformations contents: - center - slide - standardize - standardize.default - reverse - rescale - normalize - unstandardize - makepredictcall.dw_transformer - subtitle: "Others" contents: - contr.deviation - title: Data Properties desc: | Functions to compute statistical summaries of data properties and distributions contents: - as.prop.table - data_codebook - data_summary - data_tabulate - data_peek - data_seek - means_by_group - contains("distribution") - kurtosis - smoothness - skewness - row_count - row_means - weighted_mean - mean_sd - title: Convert and Replace Data desc: | Helpers for data replacements contents: - assign_labels - labels_to_levels - contains("to_numeric") - to_factor - starts_with("replace_") - starts_with("convert_") - title: Import data desc: | Helpers for importing data contents: - data_read - title: Helpers for Data Preparation desc: | Primarily useful in the context of other 'easystats' packages contents: - reshape_ci - data_rename - data_addprefix - remove_empty - contains("rownames") - rowid_as_column - contains("colnames") - extract_column_names - data_restoretype - title: Helpers for Text Formatting desc: | Primarily useful for 'report' package contents: - starts_with("text_") - title: Visualization helpers desc: | Primarily useful in the context of other 'easystats' packages contents: - visualisation_recipe - title: Data desc: | Datasets useful for examples and tests contents: - efc - nhanes_sample articles: - title: Overview of vignettes navbar: ~ contents: - overview_of_vignettes - title: Data Preparation desc: | Articles explaining utility of 'datawizard' for data wrangling navbar: ~ contents: - tidyverse_translation - selection_syntax - title: Statistical Transformations desc: | Articles describing use of 'datawizard' functions for tranforming data to use in regression models navbar: ~ contents: - standardize_data ================================================ FILE: tests/testthat/_snaps/categorize.md ================================================ # categorize labelling ranged Code categorize(mtcars$mpg, "equal_length", n_groups = 5) Output [1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3 --- Code categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") Output [1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8) [13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9] [19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1) [25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8) [31] [10.4,15.1) [19.8,24.5) Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9] --- Code categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") Output [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) [31] (10.4-15) (21-24.4) Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) # categorize breaks Code categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive") Output [1] (19.8,24.5] (19.8,24.5] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] [7] [10.4,15.1] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] (15.1,19.8] [13] (15.1,19.8] (15.1,19.8] [10.4,15.1] [10.4,15.1] [10.4,15.1] (29.2,33.9] [19] (29.2,33.9] (29.2,33.9] (19.8,24.5] (15.1,19.8] (15.1,19.8] [10.4,15.1] [25] (15.1,19.8] (24.5,29.2] (24.5,29.2] (29.2,33.9] (15.1,19.8] (15.1,19.8] [31] [10.4,15.1] (19.8,24.5] Levels: [10.4,15.1] (15.1,19.8] (19.8,24.5] (24.5,29.2] (29.2,33.9] ================================================ FILE: tests/testthat/_snaps/contr.deviation.md ================================================ # contr.deviation | snapshot Code solve(c.deviation) Output 4 6 8 Intercept 0.3333333 0.3333333 0.3333333 6 -1.0000000 1.0000000 0.0000000 8 -1.0000000 0.0000000 1.0000000 --- Code solve(mm) Output cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1 (Intercept) 0.3333333 0.0000000 0.3333333 0.0000000 0.3333333 0.0000000 cyl6 -1.0000000 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 cyl8 -1.0000000 0.0000000 0.0000000 0.0000000 1.0000000 0.0000000 am1 -0.3333333 0.3333333 -0.3333333 0.3333333 -0.3333333 0.3333333 cyl6:am1 1.0000000 -1.0000000 -1.0000000 1.0000000 0.0000000 0.0000000 cyl8:am1 1.0000000 -1.0000000 0.0000000 0.0000000 -1.0000000 1.0000000 ================================================ FILE: tests/testthat/_snaps/data_codebook.md ================================================ # data_codebook iris Code data_codebook(iris) Output iris (150 rows and 5 variables, 5 shown) ID | Name | Type | Missings | Values | N ---+--------------+-------------+----------+------------+----------- 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 ---+--------------+-------------+----------+------------+----------- 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 ---+--------------+-------------+----------+------------+----------- 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 ---+--------------+-------------+----------+------------+----------- 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 ---+--------------+-------------+----------+------------+----------- 5 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) | | | | versicolor | 50 (33.3%) | | | | virginica | 50 (33.3%) -------------------------------------------------------------------- # data_codebook iris, reordered Code data_codebook(iris[c(1, 2, 5, 3, 4)]) Output iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown) ID | Name | Type | Missings | Values | N ---+--------------+-------------+----------+------------+----------- 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 ---+--------------+-------------+----------+------------+----------- 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 ---+--------------+-------------+----------+------------+----------- 3 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) | | | | versicolor | 50 (33.3%) | | | | virginica | 50 (33.3%) ---+--------------+-------------+----------+------------+----------- 4 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 ---+--------------+-------------+----------+------------+----------- 5 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 -------------------------------------------------------------------- # data_codebook NaN and Inf Code data_codebook(d) Output d (9 rows and 1 variables, 1 shown) ID | Name | Type | Missings | Values | N ---+------+---------+-----------+--------+---------- 1 | x | numeric | 2 (22.2%) | 1 | 3 (42.9%) | | | | 2 | 1 (14.3%) | | | | 4 | 2 (28.6%) | | | | Inf | 1 (14.3%) ---------------------------------------------------- --- Code data_codebook(d) Output d (102 rows and 1 variables, 1 shown) ID | Name | Type | Missings | Values | N ---+------+---------+----------+---------+------------ 1 | x | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%) | | | | Inf | 2 ( 1.9%) ------------------------------------------------------ --- Code data_codebook(d, range_at = 100) Output d (102 rows and 1 variables, 1 shown) ID | Name | Type | Missings | Values | N ---+------+---------+----------+--------+----------- 1 | x | numeric | 0 (0.0%) | 1 | 4 ( 4.0%) | | | | 2 | 5 ( 5.0%) | | | | 3 | 6 ( 6.0%) | | | | 4 | 5 ( 5.0%) | | | | 5 | 8 ( 8.0%) | | | | 6 | 10 (10.0%) | | | | 7 | 6 ( 6.0%) | | | | 8 | 3 ( 3.0%) | | | | 9 | 13 (13.0%) | | | | 10 | 7 ( 7.0%) | | | | (...) | ---------------------------------------------------- --- Code data_codebook(d, range_at = 100, max_values = 4) Output d (102 rows and 1 variables, 1 shown) ID | Name | Type | Missings | Values | N ---+------+---------+----------+--------+--------- 1 | x | numeric | 0 (0.0%) | 1 | 4 (4.0%) | | | | 2 | 5 (5.0%) | | | | 3 | 6 (6.0%) | | | | 4 | 5 (5.0%) | | | | (...) | -------------------------------------------------- # data_codebook, tinytable Code display(data_codebook(d), format = "tt") Output +----+------+---------+-----------+--------+-----------+ | ID | Name | Type | Missings | Values | N | +====+======+=========+===========+========+===========+ | 1 | x | numeric | 2 (22.2%) | 1 | 3 (42.9%) | +----+------+---------+-----------+--------+-----------+ | | | | | 2 | 1 (14.3%) | +----+------+---------+-----------+--------+-----------+ | | | | | 4 | 2 (28.6%) | +----+------+---------+-----------+--------+-----------+ | | | | | Inf | 1 (14.3%) | +----+------+---------+-----------+--------+-----------+ Table: d (9 rows and 1 variables, 1 shown) --- Code display(data_codebook(d), format = "tt") Output +----+------+---------+----------+---------+-------------+ | ID | Name | Type | Missings | Values | N | +====+======+=========+==========+=========+=============+ | 1 | x | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%) | +----+------+---------+----------+---------+-------------+ | | | | | Inf | 2 (1.9%) | +----+------+---------+----------+---------+-------------+ Table: d (102 rows and 1 variables, 1 shown) --- Code display(data_codebook(d, range_at = 100), format = "tt") Output +----+------+---------+----------+--------+------------+ | ID | Name | Type | Missings | Values | N | +====+======+=========+==========+========+============+ | 1 | x | numeric | 0 (0.0%) | 1 | 4 (4.0%) | +----+------+---------+----------+--------+------------+ | | | | | 2 | 5 (5.0%) | +----+------+---------+----------+--------+------------+ | | | | | 3 | 6 (6.0%) | +----+------+---------+----------+--------+------------+ | | | | | 4 | 5 (5.0%) | +----+------+---------+----------+--------+------------+ | | | | | 5 | 8 (8.0%) | +----+------+---------+----------+--------+------------+ | | | | | 6 | 10 (10.0%) | +----+------+---------+----------+--------+------------+ | | | | | 7 | 6 (6.0%) | +----+------+---------+----------+--------+------------+ | | | | | 8 | 3 (3.0%) | +----+------+---------+----------+--------+------------+ | | | | | 9 | 13 (13.0%) | +----+------+---------+----------+--------+------------+ | | | | | 10 | 7 (7.0%) | +----+------+---------+----------+--------+------------+ | | | | | (...) | | +----+------+---------+----------+--------+------------+ Table: d (102 rows and 1 variables, 1 shown) --- Code display(data_codebook(d, range_at = 100, max_values = 4), format = "tt") Output +----+------+---------+----------+--------+----------+ | ID | Name | Type | Missings | Values | N | +====+======+=========+==========+========+==========+ | 1 | x | numeric | 0 (0.0%) | 1 | 4 (4.0%) | +----+------+---------+----------+--------+----------+ | | | | | 2 | 5 (5.0%) | +----+------+---------+----------+--------+----------+ | | | | | 3 | 6 (6.0%) | +----+------+---------+----------+--------+----------+ | | | | | 4 | 5 (5.0%) | +----+------+---------+----------+--------+----------+ | | | | | (...) | | +----+------+---------+----------+--------+----------+ Table: d (102 rows and 1 variables, 1 shown) --- Code display(data_codebook(iris[c(1, 2, 5, 3, 4)]), format = "tt") Output +----+--------------+-------------+----------+------------+------------+ | ID | Name | Type | Missings | Values | N | +====+==============+=============+==========+============+============+ | 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 | +----+--------------+-------------+----------+------------+------------+ | 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 | +----+--------------+-------------+----------+------------+------------+ | 3 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) | +----+--------------+-------------+----------+------------+------------+ | | | | | versicolor | 50 (33.3%) | +----+--------------+-------------+----------+------------+------------+ | | | | | virginica | 50 (33.3%) | +----+--------------+-------------+----------+------------+------------+ | 4 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 | +----+--------------+-------------+----------+------------+------------+ | 5 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 | +----+--------------+-------------+----------+------------+------------+ Table: iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown) # data_codebook iris, select Code data_codebook(iris, select = starts_with("Sepal")) Output iris (150 rows and 5 variables, 2 shown) ID | Name | Type | Missings | Values | N ---+--------------+---------+----------+------------+---- 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 ---+--------------+---------+----------+------------+---- 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 --------------------------------------------------------- # data_codebook iris, select, ID Code data_codebook(iris, select = starts_with("Petal")) Output iris (150 rows and 5 variables, 2 shown) ID | Name | Type | Missings | Values | N ---+--------------+---------+----------+------------+---- 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 ---+--------------+---------+----------+------------+---- 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 --------------------------------------------------------- # data_codebook efc Code print(data_codebook(efc), table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type | Missings | Values | Value Labels | N ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- 1 | c12hour | average number of hours of care per week | numeric | 2 (2.0%) | [5, 168] | | 98 ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) | | | | | 2 | female | 54 (54.0%) ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) | | | | | 2 | slightly dependent | 4 ( 4.1%) | | | | | 3 | moderately dependent | 28 (28.9%) | | | | | 4 | severely dependent | 63 (64.9%) ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) | | | | | 2 | intermediate level of education | 66 (73.3%) | | | | | 3 | high level of education | 16 (17.8%) ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 --------------------------------------------------------------------------------------------------------------------------------------------- --- Code print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type ---+----------+------------------------------------------+------------ 1 | c12hour | average number of hours of care per week | numeric ---+----------+------------------------------------------+------------ 2 | e16sex | elder's gender | numeric | | | ---+----------+------------------------------------------+------------ 3 | e42dep | elder's dependency | categorical | | | | | | | | | ---+----------+------------------------------------------+------------ 4 | c172code | carer's level of education | numeric | | | | | | ---+----------+------------------------------------------+------------ 5 | neg_c_7 | Negative impact with 7 items | numeric ---------------------------------------------------------------------- ID | Missings | Values | Value Labels | N ---+------------+----------+---------------------------------+----------- 1 | 2 (2.0%) | [5, 168] | | 98 ---+------------+----------+---------------------------------+----------- 2 | 0 (0.0%) | 1 | male | 46 (46.0%) | | 2 | female | 54 (54.0%) ---+------------+----------+---------------------------------+----------- 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) | | 2 | slightly dependent | 4 ( 4.1%) | | 3 | moderately dependent | 28 (28.9%) | | 4 | severely dependent | 63 (64.9%) ---+------------+----------+---------------------------------+----------- 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) | | 2 | intermediate level of education | 66 (73.3%) | | 3 | high level of education | 16 (17.8%) ---+------------+----------+---------------------------------+----------- 5 | 3 (3.0%) | [7, 28] | | 97 ------------------------------------------------------------------------- --- Code print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type ---+----------+------------------------------------------+------------ 1 | c12hour | average number of hours of care per week | numeric ---+----------+------------------------------------------+------------ 2 | e16sex | elder's gender | numeric ---+----------+------------------------------------------+------------ 3 | e42dep | elder's dependency | categorical ---+----------+------------------------------------------+------------ 4 | c172code | carer's level of education | numeric ---+----------+------------------------------------------+------------ 5 | neg_c_7 | Negative impact with 7 items | numeric ---------------------------------------------------------------------- ID | Missings | Values | Value Labels | N ---+------------+----------+---------------------------------+----------- 1 | 2 (2.0%) | [5, 168] | | 98 ---+------------+----------+---------------------------------+----------- 2 | 0 (0.0%) | 1 | male | 46 (46.0%) | | 2 | female | 54 (54.0%) ---+------------+----------+---------------------------------+----------- 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) | | 2 | slightly dependent | 4 ( 4.1%) | | 3 | moderately dependent | 28 (28.9%) | | 4 | severely dependent | 63 (64.9%) ---+------------+----------+---------------------------------+----------- 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) | | 2 | intermediate level of education | 66 (73.3%) | | 3 | high level of education | 16 (17.8%) ---+------------+----------+---------------------------------+----------- 5 | 3 (3.0%) | [7, 28] | | 97 ------------------------------------------------------------------------- # data_codebook efc, variable_label_width Code print(out, table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type | Missings | Values | Value Labels | N ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 | | care per week | | | | | ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) | | | | | 2 | female | 54 (54.0%) ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) | | | | | 2 | slightly dependent | 4 ( 4.1%) | | | | | 3 | moderately dependent | 28 (28.9%) | | | | | 4 | severely dependent | 63 (64.9%) ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) | | | | | 2 | intermediate level of education | 66 (73.3%) | | | | | 3 | high level of education | 16 (17.8%) ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 --------------------------------------------------------------------------------------------------------------------------------- --- Code print(out, table_width = "auto", remove_duplicates = FALSE) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type | Missings ---+----------+------------------------------+-------------+----------- 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | | care per week | | ---+----------+------------------------------+-------------+----------- 2 | e16sex | elder's gender | numeric | 0 (0.0%) | | | | ---+----------+------------------------------+-------------+----------- 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | | | | | | | | | | | | ---+----------+------------------------------+-------------+----------- 4 | c172code | carer's level of education | numeric | 10 (10.0%) | | | | | | | | ---+----------+------------------------------+-------------+----------- 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) ----------------------------------------------------------------------- ID | Values | Value Labels | N ---+----------+---------------------------------+----------- 1 | [5, 168] | | 98 | | | ---+----------+---------------------------------+----------- 2 | 1 | male | 46 (46.0%) | 2 | female | 54 (54.0%) ---+----------+---------------------------------+----------- 3 | 1 | independent | 2 ( 2.1%) | 2 | slightly dependent | 4 ( 4.1%) | 3 | moderately dependent | 28 (28.9%) | 4 | severely dependent | 63 (64.9%) ---+----------+---------------------------------+----------- 4 | 1 | low level of education | 8 ( 8.9%) | 2 | intermediate level of education | 66 (73.3%) | 3 | high level of education | 16 (17.8%) ---+----------+---------------------------------+----------- 5 | [7, 28] | | 97 ------------------------------------------------------------ --- Code print(out, table_width = "auto", remove_duplicates = TRUE) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type | Missings ---+----------+------------------------------+-------------+----------- 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | | care per week | | ---+----------+------------------------------+-------------+----------- 2 | e16sex | elder's gender | numeric | 0 (0.0%) ---+----------+------------------------------+-------------+----------- 3 | e42dep | elder's dependency | categorical | 3 (3.0%) ---+----------+------------------------------+-------------+----------- 4 | c172code | carer's level of education | numeric | 10 (10.0%) ---+----------+------------------------------+-------------+----------- 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) ----------------------------------------------------------------------- ID | Values | Value Labels | N ---+----------+---------------------------------+----------- 1 | [5, 168] | | 98 ---+----------+---------------------------------+----------- 2 | 1 | male | 46 (46.0%) | 2 | female | 54 (54.0%) ---+----------+---------------------------------+----------- 3 | 1 | independent | 2 ( 2.1%) | 2 | slightly dependent | 4 ( 4.1%) | 3 | moderately dependent | 28 (28.9%) | 4 | severely dependent | 63 (64.9%) ---+----------+---------------------------------+----------- 4 | 1 | low level of education | 8 ( 8.9%) | 2 | intermediate level of education | 66 (73.3%) | 3 | high level of education | 16 (17.8%) ---+----------+---------------------------------+----------- 5 | [7, 28] | | 97 ------------------------------------------------------------ # data_codebook efc, value_label_width Code print(out, table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type | Missings | Values | Value Labels | N ---+----------+------------------------------+-------------+------------+----------+------------------+----------- 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 | | care per week | | | | | ---+----------+------------------------------+-------------+------------+----------+------------------+----------- 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) | | | | | 2 | female | 54 (54.0%) ---+----------+------------------------------+-------------+------------+----------+------------------+----------- 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) | | | | | 2 | slightly... | 4 ( 4.1%) | | | | | 3 | moderately... | 28 (28.9%) | | | | | 4 | severely... | 63 (64.9%) ---+----------+------------------------------+-------------+------------+----------+------------------+----------- 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of... | 8 ( 8.9%) | | | | | 2 | intermediate... | 66 (73.3%) | | | | | 3 | high level of... | 16 (17.8%) ---+----------+------------------------------+-------------+------------+----------+------------------+----------- 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 ------------------------------------------------------------------------------------------------------------------ --- Code print(out, table_width = "auto", remove_duplicates = FALSE) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type | Missings ---+----------+------------------------------+-------------+----------- 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | | care per week | | ---+----------+------------------------------+-------------+----------- 2 | e16sex | elder's gender | numeric | 0 (0.0%) | | | | ---+----------+------------------------------+-------------+----------- 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | | | | | | | | | | | | ---+----------+------------------------------+-------------+----------- 4 | c172code | carer's level of education | numeric | 10 (10.0%) | | | | | | | | ---+----------+------------------------------+-------------+----------- 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) ----------------------------------------------------------------------- ID | Values | Value Labels | N ---+----------+------------------+----------- 1 | [5, 168] | | 98 | | | ---+----------+------------------+----------- 2 | 1 | male | 46 (46.0%) | 2 | female | 54 (54.0%) ---+----------+------------------+----------- 3 | 1 | independent | 2 ( 2.1%) | 2 | slightly... | 4 ( 4.1%) | 3 | moderately... | 28 (28.9%) | 4 | severely... | 63 (64.9%) ---+----------+------------------+----------- 4 | 1 | low level of... | 8 ( 8.9%) | 2 | intermediate... | 66 (73.3%) | 3 | high level of... | 16 (17.8%) ---+----------+------------------+----------- 5 | [7, 28] | | 97 --------------------------------------------- --- Code print(out, table_width = "auto", remove_duplicates = TRUE) Output efc (100 rows and 5 variables, 5 shown) ID | Name | Label | Type | Missings ---+----------+------------------------------+-------------+----------- 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | | care per week | | ---+----------+------------------------------+-------------+----------- 2 | e16sex | elder's gender | numeric | 0 (0.0%) ---+----------+------------------------------+-------------+----------- 3 | e42dep | elder's dependency | categorical | 3 (3.0%) ---+----------+------------------------------+-------------+----------- 4 | c172code | carer's level of education | numeric | 10 (10.0%) ---+----------+------------------------------+-------------+----------- 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) ----------------------------------------------------------------------- ID | Values | Value Labels | N ---+----------+------------------+----------- 1 | [5, 168] | | 98 ---+----------+------------------+----------- 2 | 1 | male | 46 (46.0%) | 2 | female | 54 (54.0%) ---+----------+------------------+----------- 3 | 1 | independent | 2 ( 2.1%) | 2 | slightly... | 4 ( 4.1%) | 3 | moderately... | 28 (28.9%) | 4 | severely... | 63 (64.9%) ---+----------+------------------+----------- 4 | 1 | low level of... | 8 ( 8.9%) | 2 | intermediate... | 66 (73.3%) | 3 | high level of... | 16 (17.8%) ---+----------+------------------+----------- 5 | [7, 28] | | 97 --------------------------------------------- # data_codebook truncated data Code data_codebook(d, max_values = 5) Output d (100 rows and 2 variables, 2 shown) ID | Name | Type | Missings | Values | N ---+------+-----------+----------+---------+--------- 1 | a | integer | 0 (0.0%) | [1, 15] | 100 ---+------+-----------+----------+---------+--------- 2 | b | character | 0 (0.0%) | a | 4 (4.0%) | | | | b | 3 (3.0%) | | | | c | 5 (5.0%) | | | | d | 4 (4.0%) | | | | e | 3 (3.0%) | | | | (...) | ----------------------------------------------------- # data_codebook mixed numeric lengths Code data_codebook(d) Output d (100 rows and 2 variables, 2 shown) ID | Name | Type | Missings | Values | N ---+------+---------+----------+---------+----------- 1 | a | integer | 0 (0.0%) | 1 | 28 (28.0%) | | | | 2 | 26 (26.0%) | | | | 3 | 29 (29.0%) | | | | 4 | 17 (17.0%) ---+------+---------+----------+---------+----------- 2 | b | integer | 0 (0.0%) | [5, 15] | 100 ----------------------------------------------------- # data_codebook mixed range_at Code data_codebook(d, range_at = 3) Output d (100 rows and 2 variables, 2 shown) ID | Name | Type | Missings | Values | N ---+------+---------+----------+---------+---- 1 | a | integer | 0 (0.0%) | [1, 4] | 100 ---+------+---------+----------+---------+---- 2 | b | integer | 0 (0.0%) | [5, 15] | 100 ---------------------------------------------- # data_codebook logicals Code data_codebook(d) Output d (100 rows and 3 variables, 3 shown) ID | Name | Type | Missings | Values | N ---+------+-----------+----------+---------+----------- 1 | a | integer | 0 (0.0%) | [1, 15] | 100 ---+------+-----------+----------+---------+----------- 2 | b | character | 0 (0.0%) | a | 26 (26.0%) | | | | b | 38 (38.0%) | | | | c | 36 (36.0%) ---+------+-----------+----------+---------+----------- 3 | c | logical | 0 (0.0%) | FALSE | 42 (42.0%) | | | | TRUE | 58 (58.0%) ------------------------------------------------------- # data_codebook labelled data exceptions Code data_codebook(d) Output d (100 rows and 3 variables, 3 shown) ID | Name | Type | Missings | Values | Value Labels | N ---+------+---------+------------+--------+--------------+----------- 1 | f1 | integer | 17 (17.0%) | 1 | One | 21 (25.3%) | | | | 2 | Two | 20 (24.1%) | | | | 3 | Three | 23 (27.7%) | | | | 5 | Five | 19 (22.9%) ---+------+---------+------------+--------+--------------+----------- 2 | f2 | integer | 0 (0.0%) | 1 | One | 25 (25.0%) | | | | 2 | Two | 20 (20.0%) | | | | 3 | Three | 14 (14.0%) | | | | 4 | 4 | 17 (17.0%) | | | | 5 | Five | 24 (24.0%) ---+------+---------+------------+--------+--------------+----------- 3 | f3 | integer | 0 (0.0%) | 1 | One | 21 (21.0%) | | | | 2 | Two | 24 (24.0%) | | | | 3 | Three | 16 (16.0%) | | | | 4 | Four | 14 (14.0%) | | | | 5 | Five | 25 (25.0%) --------------------------------------------------------------------- # data_codebook labelled data factors Code data_codebook(d) Output d (100 rows and 3 variables, 3 shown) ID | Name | Type | Missings | Values | Value Labels | N ---+------+-------------+----------+--------+--------------+----------- 1 | f1 | categorical | 0 (0.0%) | a | A | 35 (35.0%) | | | | b | Bee | 32 (32.0%) | | | | c | Cee | 33 (33.0%) ---+------+-------------+----------+--------+--------------+----------- 2 | f2 | categorical | 0 (0.0%) | a | A | 30 (30.0%) | | | | b | Bee | 38 (38.0%) | | | | c | Cee | 32 (32.0%) ---+------+-------------+----------+--------+--------------+----------- 3 | f3 | categorical | 0 (0.0%) | a | A | 23 (23.0%) | | | | b | Bee | 28 (28.0%) | | | | c | Cee | 49 (49.0%) ----------------------------------------------------------------------- # data_codebook works with numbers < 1 Code data_codebook(d) Output d (6 rows and 2 variables, 2 shown) ID | Name | Type | Missings | Values | N ---+------+---------+----------+--------+---------- 1 | a | numeric | 0 (0.0%) | 1 | 2 (33.3%) | | | | 2 | 2 (33.3%) | | | | 3 | 2 (33.3%) ---+------+---------+----------+--------+---------- 2 | b | numeric | 0 (0.0%) | 0 | 3 (50.0%) | | | | 1 | 2 (33.3%) | | | | 2 | 1 (16.7%) --------------------------------------------------- # data_codebook, big marks Code data_codebook(d) Output d (1,000,000 rows and 2 variables, 2 shown) ID | Name | Type | Missings | Values | N ---+------+-------------+----------+--------+---------------- 1 | f1 | categorical | 0 (0.0%) | a | 333,238 (33.3%) | | | | b | 332,910 (33.3%) | | | | c | 333,852 (33.4%) ---+------+-------------+----------+--------+---------------- 2 | f2 | categorical | 0 (0.0%) | 1 | 333,285 (33.3%) | | | | 2 | 333,358 (33.3%) | | | | 3 | 333,357 (33.3%) ------------------------------------------------------------- # data_codebook, tagged NA Code data_codebook(data.frame(x)) Output data.frame(x) (26 rows and 1 variables, 1 shown) ID | Name | Type | Missings | Values | Value Labels | N ---+------+---------+------------+--------+--------------+---------- 1 | x | numeric | 12 (46.2%) | 1 | Agreement | 4 (15.4%) | | | | 2 | 2 | 4 (15.4%) | | | | 3 | 3 | 4 (15.4%) | | | | 4 | Disagreement | 2 ( 7.7%) | | | | NA(a) | Refused | 4 (15.4%) | | | | NA(c) | First | 5 (19.2%) | | | | NA(z) | Not home | 3 (11.5%) -------------------------------------------------------------------- --- Code data_codebook(data.frame(x)) Output data.frame(x) (23 rows and 1 variables, 1 shown) ID | Name | Type | Missings | Values | Value Labels | N ---+------+---------+-----------+--------+--------------+---------- 1 | x | numeric | 9 (39.1%) | 1 | Agreement | 4 (17.4%) | | | | 2 | 2 | 4 (17.4%) | | | | 3 | 3 | 4 (17.4%) | | | | 4 | Disagreement | 2 ( 8.7%) | | | | NA(a) | Refused | 4 (17.4%) | | | | NA(c) | First | 5 (21.7%) ------------------------------------------------------------------- # data_codebook, negative label values #334 Code data_codebook(data.frame(x1, x2)) Output data.frame(x1, x2) (4 rows and 2 variables, 2 shown) ID | Name | Type | Missings | Values | Value Labels | N ---+------+---------+----------+--------+--------------+---------- 1 | x1 | integer | 0 (0.0%) | 1 | Agreement | 1 (25.0%) | | | | 2 | 2 | 1 (25.0%) | | | | 3 | 3 | 1 (25.0%) | | | | 4 | Disagreement | 1 (25.0%) ---+------+---------+----------+--------+--------------+---------- 2 | x2 | numeric | 0 (0.0%) | -9 | Missing | 1 (25.0%) | | | | 1 | Agreement | 1 (25.0%) | | | | 2 | 2 | 1 (25.0%) | | | | 3 | 3 | 1 (25.0%) ------------------------------------------------------------------ ================================================ FILE: tests/testthat/_snaps/data_modify.md ================================================ # data_modify message about recycling values Code head(data_modify(iris, Sepal.Width = 1)) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 1 1.4 0.2 setosa 2 4.9 1 1.4 0.2 setosa 3 4.7 1 1.3 0.2 setosa 4 4.6 1 1.5 0.2 setosa 5 5.0 1 1.4 0.2 setosa 6 5.4 1 1.7 0.4 setosa --- Code head(data_modify(iris, Sepal.Width = 1:2)) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 1 1.4 0.2 setosa 2 4.9 2 1.4 0.2 setosa 3 4.7 1 1.3 0.2 setosa 4 4.6 2 1.5 0.2 setosa 5 5.0 1 1.4 0.2 setosa 6 5.4 2 1.7 0.4 setosa --- Code head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1)) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 1 1 0.2 setosa 2 4.9 1 1 0.2 setosa 3 4.7 1 1 0.2 setosa 4 4.6 1 1 0.2 setosa 5 5.0 1 1 0.2 setosa 6 5.4 1 1 0.4 setosa --- Code head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1:2)) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 1 1 0.2 setosa 2 4.9 2 1 0.2 setosa 3 4.7 1 1 0.2 setosa 4 4.6 2 1 0.2 setosa 5 5.0 1 1 0.2 setosa 6 5.4 2 1 0.4 setosa --- Code head(data_modify(iris, Petal.Length = 2, Sepal.Width = 2)) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 2 2 0.2 setosa 2 4.9 2 2 0.2 setosa 3 4.7 2 2 0.2 setosa 4 4.6 2 2 0.2 setosa 5 5.0 2 2 0.2 setosa 6 5.4 2 2 0.4 setosa # data_modify message about modified variables Code head(data_modify(iris, Sepal.Width = 2 * Sepal.Width)) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 7.0 1.4 0.2 setosa 2 4.9 6.0 1.4 0.2 setosa 3 4.7 6.4 1.3 0.2 setosa 4 4.6 6.2 1.5 0.2 setosa 5 5.0 7.2 1.4 0.2 setosa 6 5.4 7.8 1.7 0.4 setosa --- Code head(data_modify(iris, Petal.Length = Sepal.Length, Sepal.Width = Petal.Width)) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 0.2 5.1 0.2 setosa 2 4.9 0.2 4.9 0.2 setosa 3 4.7 0.2 4.7 0.2 setosa 4 4.6 0.2 4.6 0.2 setosa 5 5.0 0.2 5.0 0.2 setosa 6 5.4 0.4 5.4 0.4 setosa # data_modify works with new expressions, different use cases same results Code print(head(out_complex)) Output Species sepwid seplen half_petal new_var new_num new_var2 new_num2 1 setosa 7.0 25.5 0.70 string 1 ho 4 2 setosa 6.0 24.5 0.70 string 2 ho 5 3 setosa 6.4 23.5 0.65 string 3 ho 6 4 setosa 6.2 23.0 0.75 string 4 ho 4 5 setosa 7.2 25.0 0.70 string 5 ho 5 6 setosa 7.8 27.0 0.85 string 1 ho 6 ================================================ FILE: tests/testthat/_snaps/data_partition.md ================================================ # data_partition works as expected Code data_partition(letters, seed = 123) Output $p_0.7 data .row_id 1 c 3 2 e 5 3 h 8 4 i 9 5 j 10 6 k 11 7 l 12 8 m 13 9 n 14 10 o 15 11 p 16 12 r 18 13 s 19 14 t 20 15 u 21 16 w 23 17 x 24 18 y 25 $test data .row_id 1 a 1 2 b 2 3 d 4 4 f 6 5 g 7 6 q 17 7 v 22 8 z 26 --- Code str(data_partition(iris, proportion = 0.7, seed = 123)) Output List of 2 $ p_0.7:'data.frame': 105 obs. of 6 variables: ..$ Sepal.Length: num [1:105] 4.6 5.4 4.6 5 4.4 4.9 4.8 4.8 4.3 5.8 ... ..$ Sepal.Width : num [1:105] 3.1 3.9 3.4 3.4 2.9 3.1 3.4 3 3 4 ... ..$ Petal.Length: num [1:105] 1.5 1.7 1.4 1.5 1.4 1.5 1.6 1.4 1.1 1.2 ... ..$ Petal.Width : num [1:105] 0.2 0.4 0.3 0.2 0.2 0.1 0.2 0.1 0.1 0.2 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:105] 4 6 7 8 9 10 12 13 14 15 ... $ test :'data.frame': 45 obs. of 6 variables: ..$ Sepal.Length: num [1:45] 5.1 4.9 4.7 5 5.4 5.1 5.7 5.2 5.2 5.2 ... ..$ Sepal.Width : num [1:45] 3.5 3 3.2 3.6 3.7 3.5 3.8 3.5 3.4 4.1 ... ..$ Petal.Length: num [1:45] 1.4 1.4 1.3 1.4 1.5 1.4 1.7 1.5 1.4 1.5 ... ..$ Petal.Width : num [1:45] 0.2 0.2 0.2 0.2 0.2 0.3 0.3 0.2 0.2 0.1 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:45] 1 2 3 5 11 18 19 28 29 33 ... --- Code str(data_partition(iris, proportion = c(0.2, 0.5), seed = 123)) Output List of 3 $ p_0.2:'data.frame': 30 obs. of 6 variables: ..$ Sepal.Length: num [1:30] 4.6 4.4 4.3 4.6 5 5 5.4 5 4.4 5 ... ..$ Sepal.Width : num [1:30] 3.4 2.9 3 3.6 3 3.4 3.4 3.5 3.2 3.3 ... ..$ Petal.Length: num [1:30] 1.4 1.4 1.1 1 1.6 1.6 1.5 1.3 1.3 1.4 ... ..$ Petal.Width : num [1:30] 0.3 0.2 0.1 0.2 0.2 0.4 0.4 0.3 0.2 0.2 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:30] 7 9 14 23 26 27 32 41 43 50 ... $ p_0.5:'data.frame': 75 obs. of 6 variables: ..$ Sepal.Length: num [1:75] 4.6 5.4 5 4.9 4.8 5.8 5.7 5.4 5.1 5.7 ... ..$ Sepal.Width : num [1:75] 3.1 3.9 3.4 3.1 3.4 4 4.4 3.9 3.5 3.8 ... ..$ Petal.Length: num [1:75] 1.5 1.7 1.5 1.5 1.6 1.2 1.5 1.3 1.4 1.7 ... ..$ Petal.Width : num [1:75] 0.2 0.4 0.2 0.1 0.2 0.2 0.4 0.4 0.3 0.3 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:75] 4 6 8 10 12 15 16 17 18 19 ... $ test :'data.frame': 45 obs. of 6 variables: ..$ Sepal.Length: num [1:45] 5.1 4.9 4.7 5 5.4 4.8 5.4 5.1 5.2 4.9 ... ..$ Sepal.Width : num [1:45] 3.5 3 3.2 3.6 3.7 3 3.4 3.7 4.1 3.1 ... ..$ Petal.Length: num [1:45] 1.4 1.4 1.3 1.4 1.5 1.4 1.7 1.5 1.5 1.5 ... ..$ Petal.Width : num [1:45] 0.2 0.2 0.2 0.2 0.2 0.1 0.2 0.4 0.1 0.2 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:45] 1 2 3 5 11 13 21 22 33 35 ... --- Code str(data_partition(iris, proportion = 0.7, by = "Species", seed = 123)) Output List of 2 $ p_0.7:'data.frame': 105 obs. of 6 variables: ..$ Sepal.Length: num [1:105] 4.7 4.6 5 4.6 5 4.4 4.9 5.4 4.8 4.8 ... ..$ Sepal.Width : num [1:105] 3.2 3.1 3.6 3.4 3.4 2.9 3.1 3.7 3.4 3 ... ..$ Petal.Length: num [1:105] 1.3 1.5 1.4 1.4 1.5 1.4 1.5 1.5 1.6 1.4 ... ..$ Petal.Width : num [1:105] 0.2 0.2 0.2 0.3 0.2 0.2 0.1 0.2 0.2 0.1 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:105] 3 4 5 7 8 9 10 11 12 13 ... $ test :'data.frame': 45 obs. of 6 variables: ..$ Sepal.Length: num [1:45] 5.1 4.9 5.4 5.7 5.1 5.1 5.1 4.6 5.5 4.9 ... ..$ Sepal.Width : num [1:45] 3.5 3 3.9 4.4 3.5 3.8 3.7 3.6 4.2 3.1 ... ..$ Petal.Length: num [1:45] 1.4 1.4 1.7 1.5 1.4 1.5 1.5 1 1.4 1.5 ... ..$ Petal.Width : num [1:45] 0.2 0.2 0.4 0.4 0.3 0.3 0.4 0.2 0.2 0.2 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:45] 1 2 6 16 18 20 22 23 34 35 ... --- Code str(data_partition(iris, proportion = c(0.2, 0.5), by = "Species", seed = 123)) Output List of 3 $ p_0.2:'data.frame': 30 obs. of 6 variables: ..$ Sepal.Length: num [1:30] 4.7 4.3 5.8 4.8 5 4.8 5.5 4.5 4.4 4.6 ... ..$ Sepal.Width : num [1:30] 3.2 3 4 3.4 3 3.1 3.5 2.3 3.2 3.2 ... ..$ Petal.Length: num [1:30] 1.3 1.1 1.2 1.9 1.6 1.6 1.3 1.3 1.3 1.4 ... ..$ Petal.Width : num [1:30] 0.2 0.1 0.2 0.2 0.2 0.2 0.2 0.3 0.2 0.2 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:30] 3 14 15 25 26 31 37 42 43 48 ... $ p_0.5:'data.frame': 75 obs. of 6 variables: ..$ Sepal.Length: num [1:75] 5 5.4 5 4.4 4.9 5.4 4.8 4.8 5.7 5.4 ... ..$ Sepal.Width : num [1:75] 3.6 3.9 3.4 2.9 3.1 3.7 3.4 3 4.4 3.9 ... ..$ Petal.Length: num [1:75] 1.4 1.7 1.5 1.4 1.5 1.5 1.6 1.4 1.5 1.3 ... ..$ Petal.Width : num [1:75] 0.2 0.4 0.2 0.2 0.1 0.2 0.2 0.1 0.4 0.4 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:75] 5 6 8 9 10 11 12 13 16 17 ... $ test :'data.frame': 45 obs. of 6 variables: ..$ Sepal.Length: num [1:45] 5.1 4.9 4.6 4.6 5.7 5.4 4.6 5 5.2 4.7 ... ..$ Sepal.Width : num [1:45] 3.5 3 3.1 3.4 3.8 3.4 3.6 3.4 3.5 3.2 ... ..$ Petal.Length: num [1:45] 1.4 1.4 1.5 1.4 1.7 1.7 1 1.6 1.5 1.6 ... ..$ Petal.Width : num [1:45] 0.2 0.2 0.2 0.3 0.3 0.2 0.2 0.4 0.2 0.2 ... ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... ..$ .row_id : int [1:45] 1 2 4 7 19 21 23 27 28 30 ... ================================================ FILE: tests/testthat/_snaps/data_peek.md ================================================ # data_peek snapshots look as expected Code data_peek(iris) Output Data frame with 150 rows and 5 variables Variable | Type | Values ----------------------------------------------------------------------- Sepal.Length | numeric | 5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, ... Sepal.Width | numeric | 3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, ... Petal.Length | numeric | 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, ... Petal.Width | numeric | 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, ... Species | factor | setosa, setosa, setosa, setosa, setosa, ... --- Code data_peek(iris, select = 1:3) Output Data frame with 150 rows and 5 variables Variable | Type | Values ----------------------------------------------------------------------- Sepal.Length | numeric | 5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, ... Sepal.Width | numeric | 3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, ... Petal.Length | numeric | 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, ... --- Code data_peek(iris, width = 130) Output Data frame with 150 rows and 5 variables Variable | Type | Values --------------------------------------------------------------------------------------------------------------------------------- Sepal.Length | numeric | 5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, ... Sepal.Width | numeric | 3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3, 3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, ... Petal.Length | numeric | 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, ... Petal.Width | numeric | 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, ... Species | factor | setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, ... ================================================ FILE: tests/testthat/_snaps/data_read.md ================================================ # data_read, convert many labels correctly Code data_tabulate(d$selv1) Output d$selv1 # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % ---------------------------------------------------+-----+-------+---------+------------- Vignette 1 weiblich (Gülsen E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 6.22 Vignette 2 weiblich (Gülsen E. Anwältin B) | 150 | 6.22 | 6.22 | 12.43 Vignette 3 weiblich (Monika E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 18.65 Vignette 4 weiblich (Monika E. Anwältin B) | 151 | 6.26 | 6.26 | 24.91 Vignette 5 männlich (Hasan E. Reinigungskraft B) | 151 | 6.26 | 6.26 | 31.16 Vignette 6 männlich (Hasan E. Anwalt B) | 153 | 6.34 | 6.34 | 37.51 Vignette 7 männlich (Martin E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 43.72 Vignette 8 männlich (Martin E. Anwalt B) | 150 | 6.22 | 6.22 | 49.94 Vignette 9 weiblich (Gülsen E. Reinigungskraft E) | 151 | 6.26 | 6.26 | 56.20 Vignette 10 weiblich (Gülsen E. Anwältin E) | 150 | 6.22 | 6.22 | 62.41 Vignette 11 weiblich (Monika E. Reinigungskraft E) | 150 | 6.22 | 6.22 | 68.63 Vignette 12 weiblich (Monika E. Anwältin E) | 151 | 6.26 | 6.26 | 74.89 Vignette 13 männlich (Hasan E. Reinigungskraft E) | 155 | 6.42 | 6.42 | 81.31 Vignette 14 männlich (Hasan E. Anwalt E) | 150 | 6.22 | 6.22 | 87.53 Vignette 15 männlich (Martin E. Reinigungskraft E) | 150 | 6.22 | 6.22 | 93.74 Vignette 16 männlich (Martin E. Anwalt E) | 151 | 6.26 | 6.26 | 100.00 | 0 | 0.00 | | --- Code data_tabulate(d$c12) Output Sind oder waren Sie schon einmal selbst von solchen Beschwerden betroffen? (d$c12) # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % -------------+------+-------+---------+------------- ja | 786 | 32.57 | 32.57 | 32.57 nein | 1616 | 66.97 | 66.97 | 99.54 keine Angabe | 11 | 0.46 | 0.46 | 100.00 | 0 | 0.00 | | --- Code data_tabulate(d$c12a) Output Haben Sie deswegen Behandlung(en) in Anspruch genommen? (d$c12a) # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % -------------+------+-------+---------+------------- Filter | 1627 | 67.43 | 67.43 | 67.43 ja | 500 | 20.72 | 20.72 | 88.15 nein | 285 | 11.81 | 11.81 | 99.96 keine Angabe | 1 | 0.04 | 0.04 | 100.00 | 0 | 0.00 | | --- Code data_tabulate(d$c12c) Output Wie sehr haben diese Behandlung(en) Ihre Beeinträchtigung durch die Beschwerden verbessert? (d$c12c) # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % --------------------------+------+-------+---------+------------- Filter | 1913 | 79.28 | 79.28 | 79.28 0 = keine | 34 | 1.41 | 1.41 | 80.69 1 | 2 | 0.08 | 0.08 | 80.77 2 | 11 | 0.46 | 0.46 | 81.23 3 | 14 | 0.58 | 0.58 | 81.81 4 | 19 | 0.79 | 0.79 | 82.59 5 | 61 | 2.53 | 2.53 | 85.12 6 | 42 | 1.74 | 1.74 | 86.86 7 | 63 | 2.61 | 2.61 | 89.47 8 | 97 | 4.02 | 4.02 | 93.49 9 | 53 | 2.20 | 2.20 | 95.69 10 = sehr starke | 99 | 4.10 | 4.10 | 99.79 weiß nicht / keine Angabe | 5 | 0.21 | 0.21 | 100.00 | 0 | 0.00 | | --- Code table(d$selv1) Output 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 150 150 150 151 151 153 150 150 151 150 150 151 155 150 150 151 --- Code table(d$c12) Output 1 2 99 786 1616 11 --- Code table(d$c12a) Output -2 1 2 99 1627 500 285 1 --- Code table(d$c12c) Output -2 0 1 2 3 4 5 6 7 8 9 10 99 1913 34 2 11 14 19 61 42 63 97 53 99 5 ================================================ FILE: tests/testthat/_snaps/data_rescale.md ================================================ # rescale works as expected Code head(rescale(iris, to = c(0, 1))) Message Variables of class `factor` can't be rescaled and remain unchanged. Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 0.22222222 0.6250000 0.06779661 0.04166667 setosa 2 0.16666667 0.4166667 0.06779661 0.04166667 setosa 3 0.11111111 0.5000000 0.05084746 0.04166667 setosa 4 0.08333333 0.4583333 0.08474576 0.04166667 setosa 5 0.19444444 0.6666667 0.06779661 0.04166667 setosa 6 0.30555556 0.7916667 0.11864407 0.12500000 setosa --- Code head(rescale(iris, to = c(0, 1), select = "Sepal.Length")) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 0.22222222 3.5 1.4 0.2 setosa 2 0.16666667 3.0 1.4 0.2 setosa 3 0.11111111 3.2 1.3 0.2 setosa 4 0.08333333 3.1 1.5 0.2 setosa 5 0.19444444 3.6 1.4 0.2 setosa 6 0.30555556 3.9 1.7 0.4 setosa --- Code head(rescale(iris, to = list(Sepal.Length = c(0, 1), Petal.Length = c(-1, 0)))) Message Variables of class `factor` can't be rescaled and remain unchanged. Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 0.22222222 3.5 -0.9322034 0.2 setosa 2 0.16666667 3.0 -0.9322034 0.2 setosa 3 0.11111111 3.2 -0.9491525 0.2 setosa 4 0.08333333 3.1 -0.9152542 0.2 setosa 5 0.19444444 3.6 -0.9322034 0.2 setosa 6 0.30555556 3.9 -0.8813559 0.4 setosa ================================================ FILE: tests/testthat/_snaps/data_seek.md ================================================ # data_seek - print Code data_seek(iris, "Length") Output index | column | labels ----------------------------------- 1 | Sepal.Length | Sepal.Length 3 | Petal.Length | Petal.Length --- Code data_seek(iris, "abc") Output No matches found. ================================================ FILE: tests/testthat/_snaps/data_separate.md ================================================ # data_separate: multiple columns Code out Output x_1 x_2 x_3 y_1 y_2 y_3 1 1 a 6 m n 99 2 2 b 7 77 f g 3 3 c 8 44 9 4 5 j --- Code out Output x_1 x_2 x_3 y_1 y_2 y_3 1 1 a 6 m n 99 2 2 b 7 d 77 f g 3 3 c 8 44 9 4 5 j --- Code out Output x_A x_B x_C y_A y_B y_C 1 1 a 6 m n 99 2 2 b 7 d 77 f g 3 3 c 8 44 9 4 5 j --- Code out Output x y x_A x_B x_C y_A y_B y_C 1 1.a.6 m.n.99 1 a 6 m n 99 2 2.b.7.d 77.f.g 2 b 7 d 77 f g 3 3.c.8 44.9 3 c 8 44 9 4 5.j 5 j --- Code out Output x_1 x_2 x_3 y_1 y_2 y_3 1 1 a 6 m n 99 2 b 7 d 77 f g 3 3 c 8 44 9 4 5 j --- Code out Output x y x_A x_B x_C y_A y_B y_C 1 1.a.6 m.n.99 1 a 6 m n 99 2 2.b.7.d 77.f.g 2 b 7 d 77 f g 3 3.c.8 44.9 3 c 8 44 9 9 4 5.j 5 j j --- Code out Output x y A B C 1 1.a.6 m.n.99 1m an 699 2 2.b.7.d 77.f.g 277 bf 7dg 3 3.c.8 44.9 344 c9 89 4 5.j 5NA jNA jNA --- Code out Output x y A B C 1 1.a.6 m.n.99 1m an 699 2 2.b.7.d 77.f.g 277 bf 7g 3 3.c.8 44.9 344 c9 8NA 4 5.j 5NA jNA NANA --- Code out Output x_1 x_2 x_3 y_1 y_2 y_3 1 1 a 6 m n 99 2 2 b 7 77 f g 3 3 c 8 44 44 9 4 5 5 j # data_separate: multiple columns, different lengths Code out Output A B C EE FF GG 1 1 a 6 m n 99 2 2 b 7 77 f g 3 3 c 8 44 9 4 5 j --- Code out Output A B C EE FF GG HH 1 1 a 6 m n 99 22 2 2 b 7 77 f g 34 3 3 c 8 44 9 4 5 j # data_separate: fail if invalid column selected Code data_separate(d_sep, guess_columns = "mode", select = NULL) Message Column `x` had different number of values after splitting. Variable was split into 3 columns. `x` returned more columns than expected after splitting. Right-most columns have been dropped. `x`returned fewer columns than expected after splitting. Right-most columns were filled with `NA`. Column `y` had different number of values after splitting. Variable was split into 3 columns. `y`returned fewer columns than expected after splitting. Right-most columns were filled with `NA`. Output x_1 x_2 x_3 y_1 y_2 y_3 1 1 a 6 m n 99 2 2 b 7 77 f g 3 3 c 8 44 9 4 5 j # data_separate: numeric column Code out Output y x_1 x_2 x_3 x_4 V1 m.n.99 15 435 352 3 V2 77.f.g 53 554 353 2 V3 44.9 12 342 422 V4 15 454 334 535 ================================================ FILE: tests/testthat/_snaps/data_summary.md ================================================ # data_summary, print Code print(out) Output am | gear | MW | SD ------------------------ 0 | 3 | 16.11 | 3.37 0 | 4 | 21.05 | 3.07 1 | 4 | 26.27 | 5.41 1 | 5 | 21.38 | 6.66 # data_summary, with NA Code print(out) Output c172code | MW ---------------- 1 | 87.12 2 | 94.05 3 | 75.00 | 47.80 --- Code print(out) Output c172code | MW ---------------- 1 | 87.12 2 | 94.05 3 | 75.00 --- Code print(out) Output e42dep | c172code | MW -------------------------- 1 | 2 | 17.00 2 | 2 | 34.25 3 | 1 | 39.50 3 | 2 | 52.44 3 | 3 | 52.00 3 | | 84.00 4 | 1 | 134.75 4 | 2 | 119.26 4 | 3 | 88.80 4 | | 43.29 | 2 | | | 7.00 # data_summary, bayestestR::ci Code out Output am | gear | mean_value | 95% CI --------------------------------------- 0 | 3 | 16.11 | [10.40, 21.46] 0 | 4 | 21.05 | [17.91, 24.28] 1 | 4 | 26.27 | [21.00, 33.64] 1 | 5 | 21.38 | [15.08, 29.96] ================================================ FILE: tests/testthat/_snaps/data_tabulate.md ================================================ # data_tabulate, tinytable Code display(data_tabulate(efc$c172code), format = "tt") Output +-------+----+-------+---------+--------------+ | Value | N | Raw % | Valid % | Cumulative % | +=======+====+=======+=========+==============+ | 1 | 8 | 8 | 8.89 | 8.89 | +-------+----+-------+---------+--------------+ | 2 | 66 | 66 | 73.33 | 82.22 | +-------+----+-------+---------+--------------+ | 3 | 16 | 16 | 17.78 | 100.00 | +-------+----+-------+---------+--------------+ | (NA) | 10 | 10 | (NA) | (NA) | +=======+====+=======+=========+==============+ | total N=100 valid N=90 | +=======+====+=======+=========+==============+ Table: carer's level of education (efc$c172code) (numeric) --- Code display(data_tabulate(efc, "c172code"), format = "tt") Output +-------+----+-------+---------+--------------+ | Value | N | Raw % | Valid % | Cumulative % | +=======+====+=======+=========+==============+ | 1 | 8 | 8 | 8.89 | 8.89 | +-------+----+-------+---------+--------------+ | 2 | 66 | 66 | 73.33 | 82.22 | +-------+----+-------+---------+--------------+ | 3 | 16 | 16 | 17.78 | 100.00 | +-------+----+-------+---------+--------------+ | (NA) | 10 | 10 | (NA) | (NA) | +=======+====+=======+=========+==============+ | total N=100 valid N=90 | +=======+====+=======+=========+==============+ Table: carer's level of education (c172code) (numeric) # data_tabulate, weights Code print(data_tabulate(efc$e42dep, weights = efc$weights)) Output elder's dependency (efc$e42dep) # total N=105 valid N=100 (weighted) Value | N | Raw % | Valid % | Cumulative % ------+----+-------+---------+------------- 1 | 3 | 2.86 | 3 | 3 2 | 4 | 3.81 | 4 | 7 3 | 26 | 24.76 | 26 | 33 4 | 67 | 63.81 | 67 | 100 | 5 | 4.76 | | --- Code print_md(data_tabulate(efc$e42dep, weights = efc$weights)) Output Table: elder's dependency (efc$e42dep) (categorical) |Value | N| Raw %| Valid %| Cumulative %| |:-----|--:|-----:|-------:|------------:| |1 | 3| 2.86| 3| 3| |2 | 4| 3.81| 4| 7| |3 | 26| 24.76| 26| 33| |4 | 67| 63.81| 67| 100| |(NA) | 5| 4.76| (NA)| (NA)| total N=105 valid N=100 (weighted) --- Code display(data_tabulate(efc$e42dep, weights = efc$weights)) Output Table: elder's dependency (efc$e42dep) (categorical) |Value | N| Raw %| Valid %| Cumulative %| |:-----|--:|-----:|-------:|------------:| |1 | 3| 2.86| 3| 3| |2 | 4| 3.81| 4| 7| |3 | 26| 24.76| 26| 33| |4 | 67| 63.81| 67| 100| |(NA) | 5| 4.76| (NA)| (NA)| total N=105 valid N=100 (weighted) --- Code print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$ weights)) Output # Frequency Table (weighted) Variable | Value | N | Raw % | Valid % | Cumulative % ---------+-------+----+-------+---------+------------- e42dep | 1 | 3 | 2.86 | 3 | 3 | 2 | 4 | 3.81 | 4 | 7 | 3 | 26 | 24.76 | 26 | 33 | 4 | 67 | 63.81 | 67 | 100 | | 5 | 4.76 | | ---------+-------+----+-------+---------+------------- e16sex | 1 | 50 | 47.62 | 47.62 | 47.62 | 2 | 55 | 52.38 | 52.38 | 100.00 | | 0 | 0.00 | | ------------------------------------------------------ --- Code print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights)) Output Table: Frequency Table (weighted) |Variable | Value| N| Raw %| Valid %| Cumulative %| |:--------|-----:|--:|-----:|-------:|------------:| |e42dep | 1| 3| 2.86| 3| 3| | | 2| 4| 3.81| 4| 7| | | 3| 26| 24.76| 26| 33| | | 4| 67| 63.81| 67| 100| | | (NA)| 5| 4.76| (NA)| (NA)| | | | | | | | |e16sex | 1| 50| 47.62| 47.62| 47.62| | | 2| 55| 52.38| 52.38| 100.00| | | (NA)| 0| 0.00| (NA)| (NA)| | | | | | | | --- Code display(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights)) Output Table: Frequency Table (weighted) |Variable | Value| N| Raw %| Valid %| Cumulative %| |:--------|-----:|--:|-----:|-------:|------------:| |e42dep | 1| 3| 2.86| 3| 3| | | 2| 4| 3.81| 4| 7| | | 3| 26| 24.76| 26| 33| | | 4| 67| 63.81| 67| 100| | | (NA)| 5| 4.76| (NA)| (NA)| | | | | | | | |e16sex | 1| 50| 47.62| 47.62| 47.62| | | 2| 55| 52.38| 52.38| 100.00| | | (NA)| 0| 0.00| (NA)| (NA)| | | | | | | | # data_tabulate print Code data_tabulate(efc$e42dep) Output elder's dependency (efc$e42dep) # total N=100 valid N=97 Value | N | Raw % | Valid % | Cumulative % ------+----+-------+---------+------------- 1 | 2 | 2 | 2.06 | 2.06 2 | 4 | 4 | 4.12 | 6.19 3 | 28 | 28 | 28.87 | 35.05 4 | 63 | 63 | 64.95 | 100.00 | 3 | 3 | | # data_tabulate print multiple Code data_tabulate(efc, c("c172code", "e16sex")) Output carer's level of education (c172code) # total N=100 valid N=90 Value | N | Raw % | Valid % | Cumulative % ------+----+-------+---------+------------- 1 | 8 | 8 | 8.89 | 8.89 2 | 66 | 66 | 73.33 | 82.22 3 | 16 | 16 | 17.78 | 100.00 | 10 | 10 | | elder's gender (e16sex) # total N=100 valid N=100 Value | N | Raw % | Valid % | Cumulative % ------+----+-------+---------+------------- 1 | 46 | 46 | 46 | 46 2 | 54 | 54 | 54 | 100 | 0 | 0 | | # data_tabulate big numbers Code data_tabulate(x) Output x # total N=10,000,000 valid N=10,000,000 Value | N | Raw % | Valid % | Cumulative % ------+-----------+-------+---------+------------- 1 | 1,998,318 | 19.98 | 19.98 | 19.98 2 | 1,998,338 | 19.98 | 19.98 | 39.97 3 | 2,001,814 | 20.02 | 20.02 | 59.98 4 | 1,999,423 | 19.99 | 19.99 | 79.98 5 | 2,002,107 | 20.02 | 20.02 | 100.00 | 0 | 0.00 | | --- Code print(data_tabulate(x), big_mark = "-") Output x # total N=10-000-000 valid N=10-000-000 Value | N | Raw % | Valid % | Cumulative % ------+-----------+-------+---------+------------- 1 | 1-998-318 | 19.98 | 19.98 | 19.98 2 | 1-998-338 | 19.98 | 19.98 | 39.97 3 | 2-001-814 | 20.02 | 20.02 | 59.98 4 | 1-999-423 | 19.99 | 19.99 | 79.98 5 | 2-002-107 | 20.02 | 20.02 | 100.00 | 0 | 0.00 | | --- Code print(data_tabulate(x), big_mark = "") Output x # total N=10000000 valid N=10000000 Value | N | Raw % | Valid % | Cumulative % ------+---------+-------+---------+------------- 1 | 1998318 | 19.98 | 19.98 | 19.98 2 | 1998338 | 19.98 | 19.98 | 39.97 3 | 2001814 | 20.02 | 20.02 | 59.98 4 | 1999423 | 19.99 | 19.99 | 79.98 5 | 2002107 | 20.02 | 20.02 | 100.00 | 0 | 0.00 | | # data_tabulate print multiple, collapse Code data_tabulate(efc, c("c172code", "e16sex"), collapse = TRUE) Output # Frequency Table Variable | Value | N | Raw % | Valid % | Cumulative % ---------+-------+----+-------+---------+------------- c172code | 1 | 8 | 8 | 8.89 | 8.89 | 2 | 66 | 66 | 73.33 | 82.22 | 3 | 16 | 16 | 17.78 | 100.00 | | 10 | 10 | | ---------+-------+----+-------+---------+------------- e16sex | 1 | 46 | 46 | 46 | 46 | 2 | 54 | 54 | 54 | 100 | | 0 | 0 | | ------------------------------------------------------ # data_tabulate print grouped data Code data_tabulate(poorman::group_by(efc, e16sex), "c172code") Output carer's level of education (c172code) Grouped by e16sex (1) # total N=46 valid N=41 Value | N | Raw % | Valid % | Cumulative % ------+----+-------+---------+------------- 1 | 5 | 10.87 | 12.20 | 12.20 2 | 32 | 69.57 | 78.05 | 90.24 3 | 4 | 8.70 | 9.76 | 100.00 | 5 | 10.87 | | carer's level of education (c172code) Grouped by e16sex (2) # total N=54 valid N=49 Value | N | Raw % | Valid % | Cumulative % ------+----+-------+---------+------------- 1 | 3 | 5.56 | 6.12 | 6.12 2 | 34 | 62.96 | 69.39 | 75.51 3 | 12 | 22.22 | 24.49 | 100.00 | 5 | 9.26 | | # data_tabulate print, collapse groups Code data_tabulate(poorman::group_by(efc, e16sex), "c172code", collapse = TRUE) Output # Frequency Table Variable | Group | Value | N | Raw % | Valid % | Cumulative % ---------+------------+-------+----+-------+---------+------------- c172code | e16sex (1) | 1 | 5 | 10.87 | 12.20 | 12.20 | | 2 | 32 | 69.57 | 78.05 | 90.24 | | 3 | 4 | 8.70 | 9.76 | 100.00 | | | 5 | 10.87 | | ---------+------------+-------+----+-------+---------+------------- c172code | e16sex (2) | 1 | 3 | 5.56 | 6.12 | 6.12 | | 2 | 34 | 62.96 | 69.39 | 75.51 | | 3 | 12 | 22.22 | 24.49 | 100.00 | | | 5 | 9.26 | | ------------------------------------------------------------------- # data_tabulate print, collapse groups, drop levels Code data_tabulate(poorman::group_by(efc, e16sex), "e42dep", collapse = TRUE, drop_levels = TRUE) Output # Frequency Table Variable | Group | Value | N | Raw % | Valid % | Cumulative % ---------+------------+-------+----+-------+---------+------------- e42dep | e16sex (1) | 1 | 2 | 4.35 | 4.44 | 4.44 | | 2 | 2 | 4.35 | 4.44 | 8.89 | | 3 | 8 | 17.39 | 17.78 | 26.67 | | 4 | 33 | 71.74 | 73.33 | 100.00 | | | 1 | 2.17 | | ---------+------------+-------+----+-------+---------+------------- e42dep | e16sex (2) | 2 | 2 | 3.70 | 3.85 | 3.85 | | 3 | 20 | 37.04 | 38.46 | 42.31 | | 4 | 30 | 55.56 | 57.69 | 100.00 | | | 2 | 3.70 | | ------------------------------------------------------------------- # data_tabulate, cross tables Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")) Output efc$c172code | male | female | | Total -------------+------------+------------+----------+------ 1 | 5 (5.0%) | 2 (2.0%) | 1 (1.0%) | 8 2 | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) | 66 3 | 4 (4.0%) | 11 (11.0%) | 1 (1.0%) | 16 | 5 (5.0%) | 4 (4.0%) | 1 (1.0%) | 10 -------------+------------+------------+----------+------ Total | 45 | 50 | 5 | 100 --- Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE)) Output efc$c172code | male | female | Total -------------+------------+------------+------ 1 | 5 (5.8%) | 2 (2.3%) | 7 2 | 31 (36.0%) | 33 (38.4%) | 64 3 | 4 (4.7%) | 11 (12.8%) | 15 -------------+------------+------------+------ Total | 40 | 46 | 86 --- Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)) Output efc$c172code | male | female | | Total -------------+------------+------------+----------+------ 1 | 5 (4.8%) | 3 (2.9%) | 2 (1.9%) | 10 2 | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) | 67 3 | 3 (2.9%) | 11 (10.5%) | 1 (1.0%) | 15 | 8 (7.6%) | 5 (4.8%) | 1 (1.0%) | 14 -------------+------------+------------+----------+------ Total | 48 | 51 | 7 | 105 --- Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights)) Output efc$c172code | male | female | Total -------------+------------+------------+------ 1 | 5 (5.8%) | 3 (3.5%) | 8 2 | 32 (37.2%) | 32 (37.2%) | 64 3 | 3 (3.5%) | 11 (12.8%) | 14 -------------+------------+------------+------ Total | 40 | 46 | 86 --- Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")) Output c172code | male | female | | Total ---------+------------+------------+-----------+------ 1 | 5 (62.5%) | 2 (25.0%) | 1 (12.5%) | 8 2 | 31 (47.0%) | 33 (50.0%) | 2 (3.0%) | 66 3 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 ---------+------------+------------+-----------+------ Total | 45 | 50 | 5 | 100 --- Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE)) Output c172code | male | female | Total ---------+------------+------------+------ 1 | 5 (71.4%) | 2 (28.6%) | 7 2 | 31 (48.4%) | 33 (51.6%) | 64 3 | 4 (26.7%) | 11 (73.3%) | 15 ---------+------------+------------+------ Total | 40 | 46 | 86 --- Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights)) Output c172code | male | female | | Total ---------+------------+------------+-----------+------ 1 | 5 (50.0%) | 3 (30.0%) | 2 (20.0%) | 10 2 | 32 (47.8%) | 32 (47.8%) | 3 (4.5%) | 67 3 | 3 (20.0%) | 11 (73.3%) | 1 (6.7%) | 15 | 8 (57.1%) | 5 (35.7%) | 1 (7.1%) | 14 ---------+------------+------------+-----------+------ Total | 48 | 51 | 7 | 105 --- Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights)) Output c172code | male | female | Total ---------+------------+------------+------ 1 | 5 (62.5%) | 3 (37.5%) | 8 2 | 32 (50.0%) | 32 (50.0%) | 64 3 | 3 (21.4%) | 11 (78.6%) | 14 ---------+------------+------------+------ Total | 40 | 46 | 86 --- Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")) Output c172code | male | female | | Total ---------+------------+------------+-----------+------ 1 | 5 (11.1%) | 2 (4.0%) | 1 (20.0%) | 8 2 | 31 (68.9%) | 33 (66.0%) | 2 (40.0%) | 66 3 | 4 (8.9%) | 11 (22.0%) | 1 (20.0%) | 16 | 5 (11.1%) | 4 (8.0%) | 1 (20.0%) | 10 ---------+------------+------------+-----------+------ Total | 45 | 50 | 5 | 100 --- Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE)) Output c172code | male | female | Total ---------+------------+------------+------ 1 | 5 (12.5%) | 2 (4.3%) | 7 2 | 31 (77.5%) | 33 (71.7%) | 64 3 | 4 (10.0%) | 11 (23.9%) | 15 ---------+------------+------------+------ Total | 40 | 46 | 86 --- Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights")) Output c172code | male | female | | Total ---------+------------+------------+-----------+------ 1 | 5 (10.4%) | 3 (5.9%) | 2 (28.6%) | 10 2 | 32 (66.7%) | 32 (62.7%) | 3 (42.9%) | 67 3 | 3 (6.2%) | 11 (21.6%) | 1 (14.3%) | 15 | 8 (16.7%) | 5 (9.8%) | 1 (14.3%) | 14 ---------+------------+------------+-----------+------ Total | 48 | 51 | 7 | 105 --- Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights")) Output c172code | male | female | Total ---------+------------+------------+------ 1 | 5 (12.5%) | 3 (6.5%) | 8 2 | 32 (80.0%) | 32 (69.6%) | 64 3 | 3 (7.5%) | 11 (23.9%) | 14 ---------+------------+------------+------ Total | 40 | 46 | 86 --- Code print(data_tabulate(efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row")) Output Variable | Value | male | female | | Total ---------+-------+-------------+------------+-----------+------ c172code | 1 | 5 (62.5%) | 2 (25.0%) | 1 (12.5%) | 8 c172code | 2 | 31 (47.0%) | 33 (50.0%) | 2 (3.0%) | 66 c172code | 3 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 c172code | | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 e42dep | 1 | 2 (100.0%) | 0 (0.0%) | 0 (0.0%) | 2 e42dep | 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 e42dep | 3 | 8 (28.6%) | 18 (64.3%) | 2 (7.1%) | 28 e42dep | 4 | 32 (50.8%) | 28 (44.4%) | 3 (4.8%) | 63 e42dep | | 1 (33.3%) | 2 (66.7%) | 0 (0.0%) | 3 # data_tabulate, cross tables, tinytable Code display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"), format = "tt") Output +--------------+------------+------------+----------+-------+ | efc$c172code | male | female | (NA) | Total | +==============+============+============+==========+=======+ | 1 | 5 (5.0%) | 2 (2.0%) | 1 (1.0%) | 8 | +--------------+------------+------------+----------+-------+ | 2 | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) | 66 | +--------------+------------+------------+----------+-------+ | 3 | 4 (4.0%) | 11 (11.0%) | 1 (1.0%) | 16 | +--------------+------------+------------+----------+-------+ | (NA) | 5 (5.0%) | 4 (4.0%) | 1 (1.0%) | 10 | +--------------+------------+------------+----------+-------+ | Total | 45 | 50 | 5 | 100 | +--------------+------------+------------+----------+-------+ --- Code display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE), format = "tt") Output +--------------+------------+------------+-------+ | efc$c172code | male | female | Total | +==============+============+============+=======+ | 1 | 5 (5.8%) | 2 (2.3%) | 7 | +--------------+------------+------------+-------+ | 2 | 31 (36.0%) | 33 (38.4%) | 64 | +--------------+------------+------------+-------+ | 3 | 4 (4.7%) | 11 (12.8%) | 15 | +--------------+------------+------------+-------+ | Total | 40 | 46 | 86 | +--------------+------------+------------+-------+ --- Code display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights), format = "tt") Output +--------------+------------+------------+----------+-------+ | efc$c172code | male | female | (NA) | Total | +==============+============+============+==========+=======+ | 1 | 5 (4.8%) | 3 (2.9%) | 2 (1.9%) | 10 | +--------------+------------+------------+----------+-------+ | 2 | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) | 67 | +--------------+------------+------------+----------+-------+ | 3 | 3 (2.9%) | 11 (10.5%) | 1 (1.0%) | 15 | +--------------+------------+------------+----------+-------+ | (NA) | 8 (7.6%) | 5 (4.8%) | 1 (1.0%) | 14 | +--------------+------------+------------+----------+-------+ | Total | 48 | 51 | 7 | 105 | +--------------+------------+------------+----------+-------+ --- Code display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights), format = "tt") Output +--------------+------------+------------+-------+ | efc$c172code | male | female | Total | +==============+============+============+=======+ | 1 | 5 (5.8%) | 3 (3.5%) | 8 | +--------------+------------+------------+-------+ | 2 | 32 (37.2%) | 32 (37.2%) | 64 | +--------------+------------+------------+-------+ | 3 | 3 (3.5%) | 11 (12.8%) | 14 | +--------------+------------+------------+-------+ | Total | 40 | 46 | 86 | +--------------+------------+------------+-------+ --- Code display(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"), format = "tt") Output +----------+------------+------------+-----------+-------+ | c172code | male | female | (NA) | Total | +==========+============+============+===========+=======+ | 1 | 5 (62.5%) | 2 (25.0%) | 1 (12.5%) | 8 | +----------+------------+------------+-----------+-------+ | 2 | 31 (47.0%) | 33 (50.0%) | 2 (3.0%) | 66 | +----------+------------+------------+-----------+-------+ | 3 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 | +----------+------------+------------+-----------+-------+ | (NA) | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 | +----------+------------+------------+-----------+-------+ | Total | 45 | 50 | 5 | 100 | +----------+------------+------------+-----------+-------+ --- Code display(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights), format = "tt") Output +----------+------------+------------+-------+ | c172code | male | female | Total | +==========+============+============+=======+ | 1 | 5 (62.5%) | 3 (37.5%) | 8 | +----------+------------+------------+-------+ | 2 | 32 (50.0%) | 32 (50.0%) | 64 | +----------+------------+------------+-------+ | 3 | 3 (21.4%) | 11 (78.6%) | 14 | +----------+------------+------------+-------+ | Total | 40 | 46 | 86 | +----------+------------+------------+-------+ # data_tabulate, cross tables, grouped df Code print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row")) Output Grouped by e42dep (1) Variable | Value | male | female | | Total ---------+-------+------------+--------+----------+------ c172code | 2 | 2 (100.0%) | | 0 (0.0%) | 2 | | 0 (0.0%) | | 0 (0.0%) | 0 Grouped by e42dep (2) Variable | Value | male | female | | Total ---------+-------+-----------+-----------+----------+------ c172code | 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 | | 0 (0.0%) | 0 (0.0%) | 0 (0.0%) | 0 Grouped by e42dep (3) Variable | Value | male | female | | Total ---------+-------+-----------+------------+-----------+------ c172code | 1 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 | 2 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 | 3 | 1 (16.7%) | 5 (83.3%) | 0 (0.0%) | 6 | | 1 (50.0%) | 0 (0.0%) | 1 (50.0%) | 2 Grouped by e42dep (4) Variable | Value | male | female | | Total ---------+-------+------------+------------+-----------+------ c172code | 1 | 3 (75.0%) | 0 (0.0%) | 1 (25.0%) | 4 | 2 | 23 (54.8%) | 18 (42.9%) | 1 (2.4%) | 42 | 3 | 3 (30.0%) | 6 (60.0%) | 1 (10.0%) | 10 | | 3 (42.9%) | 4 (57.1%) | 0 (0.0%) | 7 Grouped by e42dep (NA) Variable | Value | male | female | | Total ---------+-------+------------+------------+----------+------ c172code | 2 | 0 (0.0%) | 2 (100.0%) | 0 (0.0%) | 2 | | 1 (100.0%) | 0 (0.0%) | 0 (0.0%) | 1 # data_tabulate, cross tables, print/format works Code print(x) Output Variable | Value | 3 | 4 | 5 | | Total ---------+-------+----+---+---+------+------ cyl | 4 | 1 | 8 | 2 | 0 | 11 cyl | 6 | 2 | 4 | 1 | 0 | 7 cyl | 8 | 12 | 0 | 2 | 0 | 14 cyl | | 0 | 0 | 0 | 0 | 0 am | 0 | 15 | 4 | 0 | 0 | 19 am | 1 | 0 | 8 | 5 | 0 | 13 am | | 0 | 0 | 0 | 0 | 0 # data_tabulate, cross tables, markdown Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")) Output |efc$c172code | male| female| (NA) | Total| |:------------|----------:|----------:|:--------|-----:| |1 | 5 (5.0%)| 2 (2.0%)|1 (1.0%) | 8| |2 | 31 (31.0%)| 33 (33.0%)|2 (2.0%) | 66| |3 | 4 (4.0%)| 11 (11.0%)|1 (1.0%) | 16| |(NA) | 5 (5.0%)| 4 (4.0%)|1 (1.0%) | 10| | | | | | | |Total | 45| 50| 5 | 100| --- Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE)) Output |efc$c172code | male| female| Total| |:------------|----------:|----------:|-----:| |1 | 5 (5.8%)| 2 (2.3%)| 7| |2 | 31 (36.0%)| 33 (38.4%)| 64| |3 | 4 (4.7%)| 11 (12.8%)| 15| | | | | | |Total | 40| 46| 86| --- Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)) Output |efc$c172code | male| female| (NA) | Total| |:------------|----------:|----------:|:--------|-----:| |1 | 5 (4.8%)| 3 (2.9%)|2 (1.9%) | 10| |2 | 32 (30.5%)| 32 (30.5%)|3 (2.9%) | 67| |3 | 3 (2.9%)| 11 (10.5%)|1 (1.0%) | 15| |(NA) | 8 (7.6%)| 5 (4.8%)|1 (1.0%) | 14| | | | | | | |Total | 48| 51| 7 | 105| --- Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights)) Output |efc$c172code | male| female| Total| |:------------|----------:|----------:|-----:| |1 | 5 (5.8%)| 3 (3.5%)| 8| |2 | 32 (37.2%)| 32 (37.2%)| 64| |3 | 3 (3.5%)| 11 (12.8%)| 14| | | | | | |Total | 40| 46| 86| --- Code print_md(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights")) Output |c172code | male| female| Total| |:--------|----------:|----------:|-----:| |1 | 5 (12.5%)| 3 (6.5%)| 8| |2 | 32 (80.0%)| 32 (69.6%)| 64| |3 | 3 (7.5%)| 11 (23.9%)| 14| | | | | | |Total | 40| 46| 86| --- Code print_md(data_tabulate(efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row")) Output |Variable | Value| male| female| (NA) | Total| |:--------|-----:|-----------:|----------:|:---------|-----:| |c172code | 1| 5 (62.5%)| 2 (25.0%)|1 (12.5%) | 8| |c172code | 2| 31 (47.0%)| 33 (50.0%)|2 (3.0%) | 66| |c172code | 3| 4 (25.0%)| 11 (68.8%)|1 (6.2%) | 16| |c172code | (NA)| 5 (50.0%)| 4 (40.0%)|1 (10.0%) | 10| |e42dep | 1| 2 (100.0%)| 0 (0.0%)| 0 (0.0%) | 2| |e42dep | 2| 2 (50.0%)| 2 (50.0%)| 0 (0.0%) | 4| |e42dep | 3| 8 (28.6%)| 18 (64.3%)| 2 (7.1%) | 28| |e42dep | 4| 32 (50.8%)| 28 (44.4%)| 3 (4.8%) | 63| |e42dep | (NA)| 1 (33.3%)| 2 (66.7%)| 0 (0.0%) | 3| --- Code display(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights")) Output |c172code | male| female| Total| |:--------|----------:|----------:|-----:| |1 | 5 (12.5%)| 3 (6.5%)| 8| |2 | 32 (80.0%)| 32 (69.6%)| 64| |3 | 3 (7.5%)| 11 (23.9%)| 14| | | | | | |Total | 40| 46| 86| --- Code display(data_tabulate(efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row")) Output |Variable | Value| male| female| (NA) | Total| |:--------|-----:|-----------:|----------:|:---------|-----:| |c172code | 1| 5 (62.5%)| 2 (25.0%)|1 (12.5%) | 8| |c172code | 2| 31 (47.0%)| 33 (50.0%)|2 (3.0%) | 66| |c172code | 3| 4 (25.0%)| 11 (68.8%)|1 (6.2%) | 16| |c172code | (NA)| 5 (50.0%)| 4 (40.0%)|1 (10.0%) | 10| |e42dep | 1| 2 (100.0%)| 0 (0.0%)| 0 (0.0%) | 2| |e42dep | 2| 2 (50.0%)| 2 (50.0%)| 0 (0.0%) | 4| |e42dep | 3| 8 (28.6%)| 18 (64.3%)| 2 (7.1%) | 28| |e42dep | 4| 32 (50.8%)| 28 (44.4%)| 3 (4.8%) | 63| |e42dep | (NA)| 1 (33.3%)| 2 (66.7%)| 0 (0.0%) | 3| # data_tabulate, correct 0% for proportions Code print(out[[1]]) Output c172code | male | female | | Total ---------+------------+------------+----------+------ 1 | 5 (10.9%) | 3 (5.6%) | 0 (0.0%) | 8 2 | 32 (69.6%) | 34 (63.0%) | 0 (0.0%) | 66 3 | 4 (8.7%) | 12 (22.2%) | 0 (0.0%) | 16 | 5 (10.9%) | 5 (9.3%) | 0 (0.0%) | 10 ---------+------------+------------+----------+------ Total | 46 | 54 | 0 | 100 # data_tabulate, table methods Code as.table(x) Output [[1]] 4 6 8 11 7 14 --- Code as.table(x) Output [[1]] 4 6 8 11 7 14 --- Code as.table(x, remove_na = FALSE) Output [[1]] 4 6 8 11 7 14 0 --- Code as.table(x) Output [[1]] 4 6 8 11 7 14 [[2]] 3 4 5 15 12 5 --- Code as.table(x) Output [[1]] 3 4 5 4 1 8 2 6 2 4 1 8 12 0 2 --- Code as.table(x, simplify = TRUE) Output 3 4 5 4 1 8 2 6 2 4 1 8 12 0 2 --- Code as.table(x) Output [[1]] 3 4 5 4 1 8 2 6 2 4 1 8 12 0 2 --- Code as.table(x, simplify = TRUE) Output 3 4 5 4 1 8 2 6 2 4 1 8 12 0 2 --- Code as.table(x) Output [[1]] 3 4 5 0 15 4 0 1 0 8 5 [[2]] 3 4 5 4 1 8 2 6 2 4 1 8 12 0 2 --- Code as.table(x) Output $`am (0)` 3 4 4 1 2 6 2 2 8 12 0 $`am (1)` 4 5 4 6 2 6 2 1 8 0 2 ================================================ FILE: tests/testthat/_snaps/data_to_factor.md ================================================ # data_read, convert many labels correctly Code data_tabulate(to_factor(d$selv1)) Output to_factor(d$selv1) # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % ---------------------------------------------------+-----+-------+---------+------------- Vignette 1 weiblich (Gülsen E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 6.22 Vignette 2 weiblich (Gülsen E. Anwältin B) | 150 | 6.22 | 6.22 | 12.43 Vignette 3 weiblich (Monika E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 18.65 Vignette 4 weiblich (Monika E. Anwältin B) | 151 | 6.26 | 6.26 | 24.91 Vignette 5 männlich (Hasan E. Reinigungskraft B) | 151 | 6.26 | 6.26 | 31.16 Vignette 6 männlich (Hasan E. Anwalt B) | 153 | 6.34 | 6.34 | 37.51 Vignette 7 männlich (Martin E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 43.72 Vignette 8 männlich (Martin E. Anwalt B) | 150 | 6.22 | 6.22 | 49.94 Vignette 9 weiblich (Gülsen E. Reinigungskraft E) | 151 | 6.26 | 6.26 | 56.20 Vignette 10 weiblich (Gülsen E. Anwältin E) | 150 | 6.22 | 6.22 | 62.41 Vignette 11 weiblich (Monika E. Reinigungskraft E) | 150 | 6.22 | 6.22 | 68.63 Vignette 12 weiblich (Monika E. Anwältin E) | 151 | 6.26 | 6.26 | 74.89 Vignette 13 männlich (Hasan E. Reinigungskraft E) | 155 | 6.42 | 6.42 | 81.31 Vignette 14 männlich (Hasan E. Anwalt E) | 150 | 6.22 | 6.22 | 87.53 Vignette 15 männlich (Martin E. Reinigungskraft E) | 150 | 6.22 | 6.22 | 93.74 Vignette 16 männlich (Martin E. Anwalt E) | 151 | 6.26 | 6.26 | 100.00 | 0 | 0.00 | | --- Code data_tabulate(to_factor(d$c12)) Output Sind oder waren Sie schon einmal selbst von solchen Beschwerden betroffen? (to_factor(d$c12)) # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % -------------+------+-------+---------+------------- ja | 786 | 32.57 | 32.57 | 32.57 nein | 1616 | 66.97 | 66.97 | 99.54 keine Angabe | 11 | 0.46 | 0.46 | 100.00 | 0 | 0.00 | | --- Code data_tabulate(to_factor(d$c12a)) Output Haben Sie deswegen Behandlung(en) in Anspruch genommen? (to_factor(d$c12a)) # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % -------------+------+-------+---------+------------- Filter | 1627 | 67.43 | 67.43 | 67.43 ja | 500 | 20.72 | 20.72 | 88.15 nein | 285 | 11.81 | 11.81 | 99.96 keine Angabe | 1 | 0.04 | 0.04 | 100.00 | 0 | 0.00 | | --- Code data_tabulate(to_factor(d$c12c)) Output Wie sehr haben diese Behandlung(en) Ihre Beeinträchtigung durch die Beschwerden verbessert? (to_factor(d$c12c)) # total N=2413 valid N=2413 Value | N | Raw % | Valid % | Cumulative % --------------------------+------+-------+---------+------------- Filter | 1913 | 79.28 | 79.28 | 79.28 0 = keine | 34 | 1.41 | 1.41 | 80.69 1 | 2 | 0.08 | 0.08 | 80.77 2 | 11 | 0.46 | 0.46 | 81.23 3 | 14 | 0.58 | 0.58 | 81.81 4 | 19 | 0.79 | 0.79 | 82.59 5 | 61 | 2.53 | 2.53 | 85.12 6 | 42 | 1.74 | 1.74 | 86.86 7 | 63 | 2.61 | 2.61 | 89.47 8 | 97 | 4.02 | 4.02 | 93.49 9 | 53 | 2.20 | 2.20 | 95.69 10 = sehr starke | 99 | 4.10 | 4.10 | 99.79 weiß nicht / keine Angabe | 5 | 0.21 | 0.21 | 100.00 | 0 | 0.00 | | ================================================ FILE: tests/testthat/_snaps/data_to_long.md ================================================ # data_to_long works - complex dataset Code str(long) Output 'data.frame': 70000 obs. of 6 variables: $ gender : int 1 1 1 1 1 1 1 1 1 1 ... $ education : int NA NA NA NA NA NA NA NA NA NA ... $ age : int 16 16 16 16 16 16 16 16 16 16 ... $ Participant: num 61617 61617 61617 61617 61617 ... $ Item : chr "A1" "A2" "A3" "A4" ... $ Score : int 2 4 3 4 4 2 3 3 4 4 ... # don't convert factors to integer Code print(mtcars_long) Output cyl hp drat wt vs am gear carb am_f cyl_f id g value 1 4 93 3.85 2.320 1 1 4 1 1 4 3 mpg 22.80 2 4 93 3.85 2.320 1 1 4 1 1 4 3 qsec 18.61 3 4 93 3.85 2.320 1 1 4 1 1 4 3 disp 108.00 4 8 245 3.21 3.570 0 0 3 4 0 8 7 mpg 14.30 5 8 245 3.21 3.570 0 0 3 4 0 8 7 qsec 15.84 6 8 245 3.21 3.570 0 0 3 4 0 8 7 disp 360.00 7 4 66 4.08 2.200 1 1 4 1 1 4 10 mpg 32.40 8 4 66 4.08 2.200 1 1 4 1 1 4 10 qsec 19.47 9 4 66 4.08 2.200 1 1 4 1 1 4 10 disp 78.70 10 8 264 4.22 3.170 0 1 5 4 1 8 11 mpg 15.80 11 8 264 4.22 3.170 0 1 5 4 1 8 11 qsec 14.50 12 8 264 4.22 3.170 0 1 5 4 1 8 11 disp 351.00 13 6 110 3.08 3.215 1 0 3 1 0 6 4 mpg 21.40 14 6 110 3.08 3.215 1 0 3 1 0 6 4 qsec 19.44 15 6 110 3.08 3.215 1 0 3 1 0 6 4 disp 258.00 16 8 175 3.15 3.440 0 0 3 2 0 8 5 mpg 18.70 17 8 175 3.15 3.440 0 0 3 2 0 8 5 qsec 17.02 18 8 175 3.15 3.440 0 0 3 2 0 8 5 disp 360.00 19 8 335 3.54 3.570 0 1 5 8 1 8 12 mpg 15.00 20 8 335 3.54 3.570 0 1 5 8 1 8 12 qsec 14.60 21 8 335 3.54 3.570 0 1 5 8 1 8 12 disp 301.00 22 6 110 3.90 2.620 0 1 4 4 1 6 1 mpg 21.00 23 6 110 3.90 2.620 0 1 4 4 1 6 1 qsec 16.46 24 6 110 3.90 2.620 0 1 4 4 1 6 1 disp 160.00 25 6 110 3.90 2.875 0 1 4 4 1 6 2 mpg 21.00 26 6 110 3.90 2.875 0 1 4 4 1 6 2 qsec 17.02 27 6 110 3.90 2.875 0 1 4 4 1 6 2 disp 160.00 28 4 95 3.92 3.150 1 0 4 2 0 4 9 mpg 22.80 29 4 95 3.92 3.150 1 0 4 2 0 4 9 qsec 22.90 30 4 95 3.92 3.150 1 0 4 2 0 4 9 disp 140.80 31 4 62 3.69 3.190 1 0 4 2 0 4 8 mpg 24.40 32 4 62 3.69 3.190 1 0 4 2 0 4 8 qsec 20.00 33 4 62 3.69 3.190 1 0 4 2 0 4 8 disp 146.70 34 6 105 2.76 3.460 1 0 3 1 0 6 6 mpg 18.10 35 6 105 2.76 3.460 1 0 3 1 0 6 6 qsec 20.22 36 6 105 2.76 3.460 1 0 3 1 0 6 6 disp 225.00 ================================================ FILE: tests/testthat/_snaps/data_to_numeric.md ================================================ # convert data frame to numeric Code to_numeric(head(ToothGrowth), dummy_factors = TRUE) Output len supp.OJ supp.VC dose 1 4.2 0 1 0.5 2 11.5 0 1 0.5 3 7.3 0 1 0.5 4 5.8 0 1 0.5 5 6.4 0 1 0.5 6 10.0 0 1 0.5 --- Code to_numeric(head(ToothGrowth), dummy_factors = FALSE) Output len supp dose 1 4.2 2 0.5 2 11.5 2 0.5 3 7.3 2 0.5 4 5.8 2 0.5 5 6.4 2 0.5 6 10.0 2 0.5 # convert factor to numeric Code to_numeric(f, dummy_factors = TRUE) Output a c i s t 1 0 0 0 1 0 2 0 0 0 0 1 3 1 0 0 0 0 4 0 0 0 0 1 5 0 0 1 0 0 6 0 0 0 1 0 7 0 0 0 0 1 8 0 0 1 0 0 9 0 1 0 0 0 10 0 0 0 1 0 ================================================ FILE: tests/testthat/_snaps/demean.md ================================================ # demean works Code head(x) Output Sepal.Length_between Petal.Length_between Sepal.Length_within 1 5.925000 3.527500 -0.8250000 2 5.925000 3.527500 -1.0250000 3 5.925000 3.527500 -1.2250000 4 5.862222 3.951111 -1.2622222 5 5.925000 3.527500 -0.9250000 6 5.862222 3.951111 -0.4622222 Petal.Length_within 1 -2.127500 2 -2.127500 3 -2.227500 4 -2.451111 5 -2.127500 6 -2.251111 --- Code head(x) Output Sepal.Length_between binary_between Species_between Species_setosa_between 1 5.925000 0.375 0.850000 0.4250000 2 5.925000 0.375 0.850000 0.4250000 3 5.925000 0.375 0.850000 0.4250000 4 5.862222 0.400 1.133333 0.2888889 5 5.925000 0.375 0.850000 0.4250000 6 5.862222 0.400 1.133333 0.2888889 Species_versicolor_between Species_virginica_between Sepal.Length_within 1 0.3000000 0.2750000 -0.8250000 2 0.3000000 0.2750000 -1.0250000 3 0.3000000 0.2750000 -1.2250000 4 0.2888889 0.4222222 -1.2622222 5 0.3000000 0.2750000 -0.9250000 6 0.2888889 0.4222222 -0.4622222 binary_within Species_within Species_setosa_within Species_versicolor_within 1 -0.375 -0.850000 0.5750000 -0.3000000 2 0.625 -0.850000 0.5750000 -0.3000000 3 -0.375 -0.850000 0.5750000 -0.3000000 4 0.600 -1.133333 0.7111111 -0.2888889 5 0.625 -0.850000 0.5750000 -0.3000000 6 -0.400 -1.133333 0.7111111 -0.2888889 Species_virginica_within 1 -0.2750000 2 -0.2750000 3 -0.2750000 4 -0.4222222 5 -0.2750000 6 -0.4222222 --- Code head(x) Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID binary 1 5.1 3.5 1.4 0.2 setosa 3 0 2 4.9 3.0 1.4 0.2 setosa 3 1 3 4.7 3.2 1.3 0.2 setosa 3 0 4 4.6 3.1 1.5 0.2 setosa 2 1 5 5.0 3.6 1.4 0.2 setosa 3 1 6 5.4 3.9 1.7 0.4 setosa 2 0 Sepal.Length_between Petal.Length_between Sepal.Length_within 1 5.925000 3.527500 -0.8250000 2 5.925000 3.527500 -1.0250000 3 5.925000 3.527500 -1.2250000 4 5.862222 3.951111 -1.2622222 5 5.925000 3.527500 -0.9250000 6 5.862222 3.951111 -0.4622222 Petal.Length_within 1 -2.127500 2 -2.127500 3 -2.227500 4 -2.451111 5 -2.127500 6 -2.251111 # demean interaction term Code demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE) Output a_between x_y_between a_within x_y_within 1 2.666667 4.666667 -1.6666667 -0.6666667 2 2.333333 4.000000 -0.3333333 2.0000000 3 2.500000 4.500000 0.5000000 -1.5000000 4 2.666667 4.666667 1.3333333 3.3333333 5 2.333333 4.000000 -1.3333333 0.0000000 6 2.500000 4.500000 -0.5000000 1.5000000 7 2.666667 4.666667 0.3333333 -2.6666667 8 2.333333 4.000000 1.6666667 -2.0000000 ================================================ FILE: tests/testthat/_snaps/describe_distribution.md ================================================ # describe_distribution - factor Code describe_distribution(factor(substring("statistics", 1:10, 1:10))) Output Mean | SD | Range | Skewness | Kurtosis | n | n_Missing --------------------------------------------------------- | | [a, t] | -0.77 | -0.13 | 10 | 0 # describe_distribution - character Code describe_distribution(as.character(ToothGrowth$supp)) Output Mean | SD | Range | Skewness | Kurtosis | n | n_Missing ----------------------------------------------------------- | | [VC, OJ] | 0 | -2.07 | 60 | 0 # describe_distribution - grouped df Code out Output Species | Variable | Mean | SD | IQR | Range | Skewness ------------------------------------------------------------------------ setosa | Petal.Length | 1.46 | 0.17 | 0.20 | [1.00, 1.90] | 0.11 setosa | Petal.Width | 0.25 | 0.11 | 0.10 | [0.10, 0.60] | 1.25 versicolor | Petal.Length | 4.26 | 0.47 | 0.60 | [3.00, 5.10] | -0.61 versicolor | Petal.Width | 1.33 | 0.20 | 0.30 | [1.00, 1.80] | -0.03 virginica | Petal.Length | 5.55 | 0.55 | 0.80 | [4.50, 6.90] | 0.55 virginica | Petal.Width | 2.03 | 0.27 | 0.50 | [1.40, 2.50] | -0.13 Species | Kurtosis | n | n_Missing -------------------------------------- setosa | 1.02 | 50 | 0 setosa | 1.72 | 50 | 0 versicolor | 0.05 | 50 | 0 versicolor | -0.41 | 50 | 0 virginica | -0.15 | 50 | 0 virginica | -0.60 | 50 | 0 # describe_distribution - grouped df and multiple groups Code describe_distribution(x) Output grp1 | grp2 | Variable | Mean | SD | IQR | Range | Skewness -------------------------------------------------------------------------- a | a | values | 10.00 | 6.48 | 12.00 | [1.00, 19.00] | 0.00 b | a | values | 13.86 | 10.92 | 21.00 | [1.00, 28.00] | 0.23 c | a | values | 20.50 | 5.61 | 10.50 | [13.00, 28.00] | 0.00 a | b | values | 11.00 | 6.48 | 12.00 | [2.00, 20.00] | 0.00 b | b | values | 15.50 | 11.81 | 22.50 | [2.00, 29.00] | 0.00 c | b | values | 20.00 | 6.48 | 12.00 | [11.00, 29.00] | 0.00 a | c | values | 10.50 | 5.61 | 10.50 | [3.00, 18.00] | 0.00 b | c | values | 17.14 | 10.92 | 21.00 | [3.00, 30.00] | -0.23 c | c | values | 21.00 | 6.48 | 12.00 | [12.00, 30.00] | 0.00 grp1 | Kurtosis | n | n_Missing ------------------------------- a | -1.20 | 7 | 0 b | -2.14 | 7 | 0 c | -1.20 | 6 | 0 a | -1.20 | 7 | 0 b | -2.76 | 6 | 0 c | -1.20 | 7 | 0 a | -1.20 | 6 | 0 b | -2.14 | 7 | 0 c | -1.20 | 7 | 0 # describe_distribution formatting Code format(x) Output Mean | SD | IQR | Range | Quartiles | Skewness | Kurtosis | n | n_Missing -------------------------------------------------------------------------------------- 3.06 | 0.44 | 0.52 | [2.00, 4.40] | 2.80, 3.30 | 0.32 | 0.23 | 150 | 0 # (multiple) centralities with CIs Code print(out, table_width = Inf) Output Median | 95% CI (Median) | MAD | Mean | 95% CI (Mean) | SD | MAP | 95% CI (MAP) | IQR | Range | Skewness | Kurtosis | n | n_Missing ------------------------------------------------------------------------------------------------------------------------------------------------- 3 | [3.00, 3.10] | 0.44 | 3.06 | [3.00, 3.13] | 0.44 | 3.00 | [2.94, 3.07] | 0.52 | [2.00, 4.40] | 0.32 | 0.23 | 150 | 0 --- Code print(out, table_width = Inf) Output Mean | 95% CI (Mean) | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing ----------------------------------------------------------------------------------------- 3.06 | [3.00, 3.13] | 0.44 | 0.52 | [2.00, 4.40] | 0.32 | 0.23 | 150 | 0 --- Code print(out, table_width = Inf) Output Median | 95% CI (Median) | MAD | MAP | 95% CI (MAP) | IQR | Range | Skewness | Kurtosis | n | n_Missing ------------------------------------------------------------------------------------------------------------------- 3 | [3.00, 3.10] | 0.44 | 3.00 | [2.91, 3.09] | 0.52 | [2.00, 4.40] | 0.32 | 0.23 | 150 | 0 # display() method exports to markdown Code display(out) Output |Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing | |:------------|:----:|:----:|:----:|:------------:|:--------:|:--------:|:---:|:---------:| |Sepal.Length | 5.84 | 0.83 | 1.30 | (4.30, 7.90) | 0.31 | -0.55 | 150 | 0 | |Sepal.Width | 3.06 | 0.44 | 0.52 | (2.00, 4.40) | 0.32 | 0.23 | 150 | 0 | |Petal.Length | 3.76 | 1.77 | 3.52 | (1.00, 6.90) | -0.27 | -1.40 | 150 | 0 | |Petal.Width | 1.20 | 0.76 | 1.50 | (0.10, 2.50) | -0.10 | -1.34 | 150 | 0 | # display() method exports to tinytable Code display(out, format = "tt") Output +--------------+------+------+------+--------------+----------+----------+-----+-----------+ | Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing | +==============+======+======+======+==============+==========+==========+=====+===========+ | Sepal.Length | 5.84 | 0.83 | 1.30 | (4.30, 7.90) | 0.31 | -0.55 | 150 | 0 | +--------------+------+------+------+--------------+----------+----------+-----+-----------+ | Sepal.Width | 3.06 | 0.44 | 0.52 | (2.00, 4.40) | 0.32 | 0.23 | 150 | 0 | +--------------+------+------+------+--------------+----------+----------+-----+-----------+ | Petal.Length | 3.76 | 1.77 | 3.52 | (1.00, 6.90) | -0.27 | -1.40 | 150 | 0 | +--------------+------+------+------+--------------+----------+----------+-----+-----------+ | Petal.Width | 1.20 | 0.76 | 1.50 | (0.10, 2.50) | -0.10 | -1.34 | 150 | 0 | +--------------+------+------+------+--------------+----------+----------+-----+-----------+ ================================================ FILE: tests/testthat/_snaps/empty-dataframe.md ================================================ # remove empty with character Code remove_empty_columns(tmp) Output a b d 1 1 1 1 2 2 NA NA 3 3 3 3 4 NA NA NA 5 5 5 5 --- Code remove_empty_rows(tmp) Output a b c d 1 1 1 NA 1 2 2 NA NA NA 3 3 3 NA 3 5 5 5 NA 5 --- Code remove_empty(tmp) Output a b d 1 1 1 1 2 2 NA NA 3 3 3 3 5 5 5 5 ================================================ FILE: tests/testthat/_snaps/means_by_group.md ================================================ # mean_by_group Code means_by_group(efc, "c12hour", "e42dep") Output # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | 95% CI | p ---------------------------------------------------------------------- independent | 17.00 | 2 | 11.31 | [-68.46, 102.46] | 0.573 slightly dependent | 34.25 | 4 | 29.97 | [-26.18, 94.68] | 0.626 moderately dependent | 52.75 | 28 | 51.83 | [ 29.91, 75.59] | > .999 severely dependent | 106.97 | 63 | 65.88 | [ 91.74, 122.19] | 0.001 Total | 86.46 | 97 | 66.40 | | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 --- Code means_by_group(efc, "c12hour", "e42dep", ci = 0.99) Output # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | 99% CI | p ---------------------------------------------------------------------- independent | 17.00 | 2 | 11.31 | [-96.17, 130.17] | 0.573 slightly dependent | 34.25 | 4 | 29.97 | [-45.77, 114.27] | 0.626 moderately dependent | 52.75 | 28 | 51.83 | [ 22.50, 83.00] | > .999 severely dependent | 106.97 | 63 | 65.88 | [ 86.80, 127.13] | 0.001 Total | 86.46 | 97 | 66.40 | | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 --- Code means_by_group(efc, "c12hour", "e42dep", ci = NA) Output # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | p --------------------------------------------------- independent | 17.00 | 2 | 11.31 | 0.573 slightly dependent | 34.25 | 4 | 29.97 | 0.626 moderately dependent | 52.75 | 28 | 51.83 | > .999 severely dependent | 106.97 | 63 | 65.88 | 0.001 Total | 86.46 | 97 | 66.40 | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 --- Code means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep") Output # Mean of Negative impact with 7 items by elder's dependency Category | Mean | N | SD | 95% CI | p ----------------------------------------------------------------- independent | 11.00 | 2 | 0.00 | [ 5.00, 17.00] | 0.567 slightly dependent | 10.00 | 4 | 3.16 | [ 5.76, 14.24] | 0.296 moderately dependent | 13.71 | 28 | 3.14 | [12.11, 15.32] | 0.296 severely dependent | 14.67 | 60 | 4.78 | [13.57, 15.76] | 0.108 Total | 14.11 | 94 | 4.34 | | Anova: R2=0.063; adj.R2=0.032; F=2.009; p=0.118 # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | 95% CI | p ---------------------------------------------------------------------- independent | 17.00 | 2 | 11.31 | [-68.46, 102.46] | 0.573 slightly dependent | 34.25 | 4 | 29.97 | [-26.18, 94.68] | 0.626 moderately dependent | 52.75 | 28 | 51.83 | [ 29.91, 75.59] | > .999 severely dependent | 106.97 | 63 | 65.88 | [ 91.74, 122.19] | 0.001 Total | 86.46 | 97 | 66.40 | | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 --- Code means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep", ci = NA) Output # Mean of Negative impact with 7 items by elder's dependency Category | Mean | N | SD | p ------------------------------------------------ independent | 11.00 | 2 | 0.00 | 0.567 slightly dependent | 10.00 | 4 | 3.16 | 0.296 moderately dependent | 13.71 | 28 | 3.14 | 0.296 severely dependent | 14.67 | 60 | 4.78 | 0.108 Total | 14.11 | 94 | 4.34 | Anova: R2=0.063; adj.R2=0.032; F=2.009; p=0.118 # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | p --------------------------------------------------- independent | 17.00 | 2 | 11.31 | 0.573 slightly dependent | 34.25 | 4 | 29.97 | 0.626 moderately dependent | 52.75 | 28 | 51.83 | > .999 severely dependent | 106.97 | 63 | 65.88 | 0.001 Total | 86.46 | 97 | 66.40 | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 --- Code means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep", ci = 0.99) Output # Mean of Negative impact with 7 items by elder's dependency Category | Mean | N | SD | 99% CI | p ----------------------------------------------------------------- independent | 11.00 | 2 | 0.00 | [ 3.05, 18.95] | 0.567 slightly dependent | 10.00 | 4 | 3.16 | [ 4.38, 15.62] | 0.296 moderately dependent | 13.71 | 28 | 3.14 | [11.59, 15.84] | 0.296 severely dependent | 14.67 | 60 | 4.78 | [13.22, 16.12] | 0.108 Total | 14.11 | 94 | 4.34 | | Anova: R2=0.063; adj.R2=0.032; F=2.009; p=0.118 # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | 99% CI | p ---------------------------------------------------------------------- independent | 17.00 | 2 | 11.31 | [-96.17, 130.17] | 0.573 slightly dependent | 34.25 | 4 | 29.97 | [-45.77, 114.27] | 0.626 moderately dependent | 52.75 | 28 | 51.83 | [ 22.50, 83.00] | > .999 severely dependent | 106.97 | 63 | 65.88 | [ 86.80, 127.13] | 0.001 Total | 86.46 | 97 | 66.40 | | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 --- Code means_by_group(efc$c12hour, efc$e42dep) Output # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | 95% CI | p ---------------------------------------------------------------------- independent | 17.00 | 2 | 11.31 | [-68.46, 102.46] | 0.573 slightly dependent | 34.25 | 4 | 29.97 | [-26.18, 94.68] | 0.626 moderately dependent | 52.75 | 28 | 51.83 | [ 29.91, 75.59] | > .999 severely dependent | 106.97 | 63 | 65.88 | [ 91.74, 122.19] | 0.001 Total | 86.46 | 97 | 66.40 | | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 --- Code means_by_group(efc$c12hour, efc$e42dep, ci = NA) Output # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | p --------------------------------------------------- independent | 17.00 | 2 | 11.31 | 0.573 slightly dependent | 34.25 | 4 | 29.97 | 0.626 moderately dependent | 52.75 | 28 | 51.83 | > .999 severely dependent | 106.97 | 63 | 65.88 | 0.001 Total | 86.46 | 97 | 66.40 | Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001 ================================================ FILE: tests/testthat/_snaps/normalize.md ================================================ # normalize work as expected Code head(normalize(trees)) Output Girth Height Volume 1 0.00000000 0.29166667 0.001497006 2 0.02439024 0.08333333 0.001497006 3 0.04065041 0.00000000 0.000000000 4 0.17886179 0.37500000 0.092814371 5 0.19512195 0.75000000 0.128742515 6 0.20325203 0.83333333 0.142215569 ================================================ FILE: tests/testthat/_snaps/print.dw_transformer.md ================================================ # print.dw_transformer Code rescale(iris$Sepal.Length) Output [1] 22.222222 16.666667 11.111111 8.333333 19.444444 30.555556 [7] 8.333333 19.444444 2.777778 16.666667 30.555556 13.888889 [13] 13.888889 0.000000 41.666667 38.888889 30.555556 22.222222 [19] 38.888889 22.222222 30.555556 22.222222 8.333333 22.222222 [25] 13.888889 19.444444 19.444444 25.000000 25.000000 11.111111 [31] 13.888889 30.555556 25.000000 33.333333 16.666667 19.444444 [37] 33.333333 16.666667 2.777778 22.222222 19.444444 5.555556 [43] 2.777778 19.444444 22.222222 13.888889 22.222222 8.333333 [49] 27.777778 19.444444 75.000000 58.333333 72.222222 33.333333 [55] 61.111111 38.888889 55.555556 16.666667 63.888889 25.000000 [61] 19.444444 44.444444 47.222222 50.000000 36.111111 66.666667 [67] 36.111111 41.666667 52.777778 36.111111 44.444444 50.000000 [73] 55.555556 50.000000 58.333333 63.888889 69.444444 66.666667 [79] 47.222222 38.888889 33.333333 33.333333 41.666667 47.222222 [85] 30.555556 47.222222 66.666667 55.555556 36.111111 33.333333 [91] 33.333333 50.000000 41.666667 19.444444 36.111111 38.888889 [97] 38.888889 52.777778 22.222222 38.888889 55.555556 41.666667 [103] 77.777778 55.555556 61.111111 91.666667 16.666667 83.333333 [109] 66.666667 80.555556 61.111111 58.333333 69.444444 38.888889 [115] 41.666667 58.333333 61.111111 94.444444 94.444444 47.222222 [121] 72.222222 36.111111 94.444444 55.555556 66.666667 80.555556 [127] 52.777778 50.000000 58.333333 80.555556 86.111111 100.000000 [133] 58.333333 55.555556 50.000000 94.444444 55.555556 58.333333 [139] 47.222222 72.222222 66.666667 72.222222 41.666667 69.444444 [145] 66.666667 66.666667 55.555556 61.111111 52.777778 44.444444 (original range = 4.3 to 7.9) --- Code normalize(iris$Sepal.Length) Output [1] 0.22222222 0.16666667 0.11111111 0.08333333 0.19444444 0.30555556 [7] 0.08333333 0.19444444 0.02777778 0.16666667 0.30555556 0.13888889 [13] 0.13888889 0.00000000 0.41666667 0.38888889 0.30555556 0.22222222 [19] 0.38888889 0.22222222 0.30555556 0.22222222 0.08333333 0.22222222 [25] 0.13888889 0.19444444 0.19444444 0.25000000 0.25000000 0.11111111 [31] 0.13888889 0.30555556 0.25000000 0.33333333 0.16666667 0.19444444 [37] 0.33333333 0.16666667 0.02777778 0.22222222 0.19444444 0.05555556 [43] 0.02777778 0.19444444 0.22222222 0.13888889 0.22222222 0.08333333 [49] 0.27777778 0.19444444 0.75000000 0.58333333 0.72222222 0.33333333 [55] 0.61111111 0.38888889 0.55555556 0.16666667 0.63888889 0.25000000 [61] 0.19444444 0.44444444 0.47222222 0.50000000 0.36111111 0.66666667 [67] 0.36111111 0.41666667 0.52777778 0.36111111 0.44444444 0.50000000 [73] 0.55555556 0.50000000 0.58333333 0.63888889 0.69444444 0.66666667 [79] 0.47222222 0.38888889 0.33333333 0.33333333 0.41666667 0.47222222 [85] 0.30555556 0.47222222 0.66666667 0.55555556 0.36111111 0.33333333 [91] 0.33333333 0.50000000 0.41666667 0.19444444 0.36111111 0.38888889 [97] 0.38888889 0.52777778 0.22222222 0.38888889 0.55555556 0.41666667 [103] 0.77777778 0.55555556 0.61111111 0.91666667 0.16666667 0.83333333 [109] 0.66666667 0.80555556 0.61111111 0.58333333 0.69444444 0.38888889 [115] 0.41666667 0.58333333 0.61111111 0.94444444 0.94444444 0.47222222 [121] 0.72222222 0.36111111 0.94444444 0.55555556 0.66666667 0.80555556 [127] 0.52777778 0.50000000 0.58333333 0.80555556 0.86111111 1.00000000 [133] 0.58333333 0.55555556 0.50000000 0.94444444 0.55555556 0.58333333 [139] 0.47222222 0.72222222 0.66666667 0.72222222 0.41666667 0.69444444 [145] 0.66666667 0.66666667 0.55555556 0.61111111 0.52777778 0.44444444 (original range = 4.3 to 7.9) --- Code center(iris$Sepal.Length) Output [1] -0.74333333 -0.94333333 -1.14333333 -1.24333333 -0.84333333 -0.44333333 [7] -1.24333333 -0.84333333 -1.44333333 -0.94333333 -0.44333333 -1.04333333 [13] -1.04333333 -1.54333333 -0.04333333 -0.14333333 -0.44333333 -0.74333333 [19] -0.14333333 -0.74333333 -0.44333333 -0.74333333 -1.24333333 -0.74333333 [25] -1.04333333 -0.84333333 -0.84333333 -0.64333333 -0.64333333 -1.14333333 [31] -1.04333333 -0.44333333 -0.64333333 -0.34333333 -0.94333333 -0.84333333 [37] -0.34333333 -0.94333333 -1.44333333 -0.74333333 -0.84333333 -1.34333333 [43] -1.44333333 -0.84333333 -0.74333333 -1.04333333 -0.74333333 -1.24333333 [49] -0.54333333 -0.84333333 1.15666667 0.55666667 1.05666667 -0.34333333 [55] 0.65666667 -0.14333333 0.45666667 -0.94333333 0.75666667 -0.64333333 [61] -0.84333333 0.05666667 0.15666667 0.25666667 -0.24333333 0.85666667 [67] -0.24333333 -0.04333333 0.35666667 -0.24333333 0.05666667 0.25666667 [73] 0.45666667 0.25666667 0.55666667 0.75666667 0.95666667 0.85666667 [79] 0.15666667 -0.14333333 -0.34333333 -0.34333333 -0.04333333 0.15666667 [85] -0.44333333 0.15666667 0.85666667 0.45666667 -0.24333333 -0.34333333 [91] -0.34333333 0.25666667 -0.04333333 -0.84333333 -0.24333333 -0.14333333 [97] -0.14333333 0.35666667 -0.74333333 -0.14333333 0.45666667 -0.04333333 [103] 1.25666667 0.45666667 0.65666667 1.75666667 -0.94333333 1.45666667 [109] 0.85666667 1.35666667 0.65666667 0.55666667 0.95666667 -0.14333333 [115] -0.04333333 0.55666667 0.65666667 1.85666667 1.85666667 0.15666667 [121] 1.05666667 -0.24333333 1.85666667 0.45666667 0.85666667 1.35666667 [127] 0.35666667 0.25666667 0.55666667 1.35666667 1.55666667 2.05666667 [133] 0.55666667 0.45666667 0.25666667 1.85666667 0.45666667 0.55666667 [139] 0.15666667 1.05666667 0.85666667 1.05666667 -0.04333333 0.95666667 [145] 0.85666667 0.85666667 0.45666667 0.65666667 0.35666667 0.05666667 (center: 5.8, scale = 1) --- Code standardize(iris$Sepal.Length) Output [1] -0.89767388 -1.13920048 -1.38072709 -1.50149039 -1.01843718 -0.53538397 [7] -1.50149039 -1.01843718 -1.74301699 -1.13920048 -0.53538397 -1.25996379 [13] -1.25996379 -1.86378030 -0.05233076 -0.17309407 -0.53538397 -0.89767388 [19] -0.17309407 -0.89767388 -0.53538397 -0.89767388 -1.50149039 -0.89767388 [25] -1.25996379 -1.01843718 -1.01843718 -0.77691058 -0.77691058 -1.38072709 [31] -1.25996379 -0.53538397 -0.77691058 -0.41462067 -1.13920048 -1.01843718 [37] -0.41462067 -1.13920048 -1.74301699 -0.89767388 -1.01843718 -1.62225369 [43] -1.74301699 -1.01843718 -0.89767388 -1.25996379 -0.89767388 -1.50149039 [49] -0.65614727 -1.01843718 1.39682886 0.67224905 1.27606556 -0.41462067 [55] 0.79301235 -0.17309407 0.55148575 -1.13920048 0.91377565 -0.77691058 [61] -1.01843718 0.06843254 0.18919584 0.30995914 -0.29385737 1.03453895 [67] -0.29385737 -0.05233076 0.43072244 -0.29385737 0.06843254 0.30995914 [73] 0.55148575 0.30995914 0.67224905 0.91377565 1.15530226 1.03453895 [79] 0.18919584 -0.17309407 -0.41462067 -0.41462067 -0.05233076 0.18919584 [85] -0.53538397 0.18919584 1.03453895 0.55148575 -0.29385737 -0.41462067 [91] -0.41462067 0.30995914 -0.05233076 -1.01843718 -0.29385737 -0.17309407 [97] -0.17309407 0.43072244 -0.89767388 -0.17309407 0.55148575 -0.05233076 [103] 1.51759216 0.55148575 0.79301235 2.12140867 -1.13920048 1.75911877 [109] 1.03453895 1.63835547 0.79301235 0.67224905 1.15530226 -0.17309407 [115] -0.05233076 0.67224905 0.79301235 2.24217198 2.24217198 0.18919584 [121] 1.27606556 -0.29385737 2.24217198 0.55148575 1.03453895 1.63835547 [127] 0.43072244 0.30995914 0.67224905 1.63835547 1.87988207 2.48369858 [133] 0.67224905 0.55148575 0.30995914 2.24217198 0.55148575 0.67224905 [139] 0.18919584 1.27606556 1.03453895 1.27606556 -0.05233076 1.15530226 [145] 1.03453895 1.03453895 0.55148575 0.79301235 0.43072244 0.06843254 (center: 5.8, scale = 0.83) ================================================ FILE: tests/testthat/_snaps/ranktransform.md ================================================ # ranktransform works with data frames Code ranktransform(BOD) Output Time demand 1 1 1 2 2 2 3 3 5 4 4 4 5 5 3 6 6 6 ================================================ FILE: tests/testthat/_snaps/rescale_weights.md ================================================ # rescale_weights works as expected Code head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) Output total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a 1 1 2.20 1 3 2 31 97593.68 1.5733612 2 7 2.08 2 3 1 29 39599.36 0.6231745 3 3 1.48 2 1 2 42 26619.83 0.8976966 4 4 1.32 2 4 2 33 34998.53 0.7083628 5 1 2.00 2 1 1 41 14746.45 0.4217782 6 6 2.20 2 4 1 38 28232.10 0.6877550 rescaled_weights_b 1 1.2005159 2 0.5246593 3 0.5439111 4 0.5498944 5 0.3119698 6 0.5155503 --- Code head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU"))) Output total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweight_a_SDMVSTRA 1 1 2.20 1 3 2 31 97593.68 1.5733612 2 7 2.08 2 3 1 29 39599.36 0.6231745 3 3 1.48 2 1 2 42 26619.83 0.8976966 4 4 1.32 2 4 2 33 34998.53 0.7083628 5 1 2.00 2 1 1 41 14746.45 0.4217782 6 6 2.20 2 4 1 38 28232.10 0.6877550 pweight_b_SDMVSTRA pweight_a_SDMVPSU pweight_b_SDMVPSU 1 1.2005159 1.8458164 1.3699952 2 0.5246593 0.8217570 0.5780808 3 0.5439111 0.5034683 0.3736824 4 0.5498944 0.6619369 0.4913004 5 0.3119698 0.3060151 0.2152722 6 0.5155503 0.5858662 0.4121388 --- Code head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish")) Output total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights 1 1 2.20 1 3 2 31 97593.68 1.3952529 2 7 2.08 2 3 1 29 39599.36 0.5661343 3 3 1.48 2 1 2 42 26619.83 0.3805718 4 4 1.32 2 4 2 33 34998.53 0.5003582 5 1 2.00 2 1 1 41 14746.45 0.2108234 6 6 2.20 2 4 1 38 28232.10 0.4036216 --- Code rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") Output total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a 1 1 2.20 1 3 2 31 97593.68 1.0000000 2 7 2.08 2 3 1 29 39599.36 0.5819119 3 3 1.48 2 1 2 42 NA NA 4 4 1.32 2 4 2 33 34998.53 0.6766764 5 1 2.00 2 1 1 41 14746.45 0.7471696 6 6 2.20 2 4 1 38 28232.10 1.0000000 7 350 1.60 1 3 2 33 93162.43 1.8012419 8 NA 1.48 2 3 1 29 82275.99 1.2090441 9 3 2.28 2 4 1 41 24726.39 1.2528304 10 30 0.84 1 3 2 35 NA NA 11 70 1.24 1 4 2 33 27002.70 0.5220817 12 5 1.68 2 1 2 39 18792.03 1.0000000 13 60 2.20 1 3 2 30 76894.56 1.0000000 14 2 1.48 2 3 1 29 NA NA 15 8 2.36 2 3 2 39 NA NA 16 3 2.04 2 3 2 36 98200.91 1.0000000 17 1 2.08 1 3 1 40 87786.09 1.0000000 18 7 1.00 1 3 2 32 90803.16 1.0000000 19 9 2.28 2 3 2 34 NA NA 20 2 1.24 2 3 1 29 82275.99 1.2090441 rescaled_weights_b 1 1.0000000 2 0.5351412 3 NA 4 0.5107078 5 0.7022777 6 1.0000000 7 1.3594509 8 1.1118681 9 1.1775572 10 NA 11 0.3940306 12 1.0000000 13 1.0000000 14 NA 15 NA 16 1.0000000 17 1.0000000 18 1.0000000 19 NA 20 1.1118681 --- Code rescale_weights(nhanes_sample, "WTINT2YR", method = "kish") Output total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights 1 1 2.20 1 3 2 31 97593.68 1.2734329 2 7 2.08 2 3 1 29 39599.36 0.5167049 3 3 1.48 2 1 2 42 NA NA 4 4 1.32 2 4 2 33 34998.53 0.4566718 5 1 2.00 2 1 1 41 14746.45 0.1924164 6 6 2.20 2 4 1 38 28232.10 0.3683813 7 350 1.60 1 3 2 33 93162.43 1.2156126 8 NA 1.48 2 3 1 29 82275.99 1.0735629 9 3 2.28 2 4 1 41 24726.39 0.3226377 10 30 0.84 1 3 2 35 NA NA 11 70 1.24 1 4 2 33 27002.70 0.3523397 12 5 1.68 2 1 2 39 18792.03 0.2452044 13 60 2.20 1 3 2 30 76894.56 1.0033444 14 2 1.48 2 3 1 29 NA NA 15 8 2.36 2 3 2 39 NA NA 16 3 2.04 2 3 2 36 98200.91 1.2813563 17 1 2.08 1 3 1 40 87786.09 1.1454605 18 7 1.00 1 3 2 32 90803.16 1.1848281 19 9 2.28 2 3 2 34 NA NA 20 2 1.24 2 3 1 29 82275.99 1.0735629 # rescale_weights nested works as expected Code rescale_weights(data = head(nhanes_sample, n = 30), by = c("SDMVSTRA", "SDMVPSU"), probability_weights = "WTINT2YR", nest = TRUE) Output total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a 1 1 2.20 1 3 2 31 97593.679 1.0000000 2 7 2.08 2 3 1 29 39599.363 0.5502486 3 3 1.48 2 1 2 42 26619.834 0.9512543 4 4 1.32 2 4 2 33 34998.530 0.6766764 5 1 2.00 2 1 1 41 14746.454 0.7147710 6 6 2.20 2 4 1 38 28232.100 1.0000000 7 350 1.60 1 3 2 33 93162.431 1.8012419 8 NA 1.48 2 3 1 29 82275.986 1.1432570 9 3 2.28 2 4 1 41 24726.391 1.1985056 10 30 0.84 1 3 2 35 39895.048 1.0000000 11 70 1.24 1 4 2 33 27002.703 0.5220817 12 5 1.68 2 1 2 39 18792.034 0.3866720 13 60 2.20 1 3 2 30 76894.563 1.0000000 14 2 1.48 2 3 1 29 82275.986 1.1432570 15 8 2.36 2 3 2 39 78406.811 1.6133280 16 3 2.04 2 3 2 36 98200.912 1.0000000 17 1 2.08 1 3 1 40 87786.091 1.0000000 18 7 1.00 1 3 2 32 90803.158 1.2693642 19 9 2.28 2 3 2 34 45002.917 1.0000000 20 2 1.24 2 3 1 29 82275.986 1.1432570 21 4 2.28 2 3 1 34 91437.145 1.4088525 22 3 1.04 1 1 2 42 29348.027 1.0487457 23 4 1.12 1 1 1 34 38366.567 0.5911475 24 1 1.52 2 1 1 42 6622.334 1.0000000 25 22 2.24 1 4 1 41 22420.209 1.0867233 26 7 1.00 2 3 2 41 65529.204 1.0000000 27 5 0.92 2 4 1 30 27089.745 1.0000000 28 15 1.04 1 3 2 32 52265.570 0.7306358 29 3 0.80 1 3 1 33 64789.307 1.0000000 30 1 1.00 1 3 1 29 73404.222 1.0199804 rescaled_weights_b 1 1.0000000 2 0.5226284 3 0.9489993 4 0.5107078 5 0.6854605 6 1.0000000 7 1.3594509 8 1.0858702 9 1.1493587 10 1.0000000 11 0.3940306 12 0.2809766 13 1.0000000 14 1.0858702 15 1.1723308 16 1.0000000 17 1.0000000 18 1.1834934 19 1.0000000 20 1.0858702 21 1.2070771 22 1.0462596 23 0.5064835 24 1.0000000 25 1.0421602 26 1.0000000 27 1.0000000 28 0.6812093 29 1.0000000 30 0.9687816 ================================================ FILE: tests/testthat/_snaps/reshape_ci.md ================================================ # reshape_ci with single CI level Code df_reshape Output Parameter CI_low CI_high CI 1 Term 1 0.2 0.5 0.8 # reshape_ci with multiple CI levels Code reshape_ci(x) Output Parameter CI_low_0.8 CI_high_0.8 CI_low_0.9 CI_high_0.9 1 Term 1 0.2 0.5 0.10 0.80 2 Term 2 0.3 0.6 0.15 0.85 --- Code reshape_ci(reshape_ci(x)) Output Parameter CI CI_low CI_high 1 Term 1 0.8 0.20 0.50 2 Term 1 0.9 0.10 0.80 3 Term 2 0.8 0.30 0.60 4 Term 2 0.9 0.15 0.85 ================================================ FILE: tests/testthat/_snaps/skewness-kurtosis.md ================================================ # skewness works with data frames Code skewness(iris[, 1:4]) Output Parameter | Skewness | SE ------------------------------- Sepal.Length | 0.315 | 0.196 Sepal.Width | 0.319 | 0.196 Petal.Length | -0.275 | 0.196 Petal.Width | -0.103 | 0.196 --- Code skewness(iris[, 1:4], iterations = 100) Output Parameter | Skewness | SE ------------------------------- Sepal.Length | 0.315 | 0.126 Sepal.Width | 0.319 | 0.175 Petal.Length | -0.275 | 0.137 Petal.Width | -0.103 | 0.134 # kurtosis works with data frames Code kurtosis(iris[, 1:4]) Output Parameter | Kurtosis | SE ------------------------------- Sepal.Length | -0.552 | 0.381 Sepal.Width | 0.228 | 0.381 Petal.Length | -1.402 | 0.381 Petal.Width | -1.341 | 0.381 --- Code kurtosis(iris[, 1:4], iterations = 100) Output Parameter | Kurtosis | SE ------------------------------- Sepal.Length | -0.552 | 0.188 Sepal.Width | 0.228 | 0.351 Petal.Length | -1.402 | 0.167 Petal.Width | -1.341 | 0.115 # skewness works with matrices Code skewness(as.matrix(iris[, 1:4])) Output Parameter | Skewness | SE ------------------------------- Sepal.Length | 0.315 | 0.196 Sepal.Width | 0.319 | 0.196 Petal.Length | -0.275 | 0.196 Petal.Width | -0.103 | 0.196 --- Code skewness(as.matrix(iris[, 1:4]), iterations = 100) Output Parameter | Skewness | SE ------------------------------- Sepal.Length | 0.315 | 0.126 Sepal.Width | 0.319 | 0.175 Petal.Length | -0.275 | 0.137 Petal.Width | -0.103 | 0.134 # kurtosis works with matrices Code kurtosis(as.matrix(iris[, 1:4])) Output Parameter | Kurtosis | SE ------------------------------- Sepal.Length | -0.552 | 0.381 Sepal.Width | 0.228 | 0.381 Petal.Length | -1.402 | 0.381 Petal.Width | -1.341 | 0.381 --- Code kurtosis(as.matrix(iris[, 1:4]), iterations = 100) Output Parameter | Kurtosis | SE ------------------------------- Sepal.Length | -0.552 | 0.188 Sepal.Width | 0.228 | 0.351 Petal.Length | -1.402 | 0.167 Petal.Width | -1.341 | 0.115 ================================================ FILE: tests/testthat/_snaps/smoothness.md ================================================ # smoothness works with data frames Code smoothness(BOD) Output Parameter Time "Time" "0.986393923832144" demand "demand" "0.406270770677043" attr(,"class") [1] "parameters_smoothness" "matrix" "array" ================================================ FILE: tests/testthat/_snaps/text_format.md ================================================ # text formatting helpers work as expected Code text_format(c("A very long First", "Some similar long Second", "Shorter Third", "More or less long Fourth", "And finally the Last"), width = 20) Output [1] "A very long First,\nSome similar long\nSecond, Shorter\nThird, More or less\nlong Fourth and And\nfinally the Last\n" --- Code text_format(c("A very long First", "Some similar long Second", "Shorter Third", "More or less long Fourth", "And finally the Last"), last = " or ", enclose = "`", width = 20) Output [1] "`A very long\nFirst`, `Some\nsimilar long\nSecond`, `Shorter\nThird`, `More or\nless long Fourth`\nor `And finally the\nLast`\n" # text formatters respect `width` argument Code long_text <- strrep("abc ", 100) cat(text_format(long_text, width = 50)) Output abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc Code cat(text_format(long_text, width = 80)) Output abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc Code withr::with_options(list(width = 50), code = { cat(text_format(long_text)) }) Output abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc ================================================ FILE: tests/testthat/_snaps/windows/means_by_group.md ================================================ # mean_by_group, weighted Code means_by_group(efc, "c12hour", "e42dep", weights = "weight") Output # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | 95% CI | p ---------------------------------------------------------------------- independent | 16.92 | 3 | 11.31 | [-60.82, 94.66] | 0.486 slightly dependent | 33.56 | 4 | 29.75 | [-26.93, 94.05] | 0.593 moderately dependent | 52.74 | 26 | 54.44 | [ 28.71, 76.76] | 0.996 severely dependent | 108.08 | 67 | 65.40 | [ 93.01, 123.16] | < .001 Total | 88.11 | 97 | 67.01 | | Anova: R2=0.191; adj.R2=0.165; F=7.329; p<.001 --- Code means_by_group(efc, "c12hour", "e42dep", weights = "weight", ci = NA) Output # Mean of average number of hours of care per week by elder's dependency Category | Mean | N | SD | p --------------------------------------------------- independent | 16.92 | 3 | 11.31 | 0.486 slightly dependent | 33.56 | 4 | 29.75 | 0.593 moderately dependent | 52.74 | 26 | 54.44 | 0.996 severely dependent | 108.08 | 67 | 65.40 | < .001 Total | 88.11 | 97 | 67.01 | Anova: R2=0.191; adj.R2=0.165; F=7.329; p<.001 ================================================ FILE: tests/testthat/_snaps/winsorization.md ================================================ # with missing values Code suppressWarnings(head(winsorize(na.omit(ggplot2::msleep$brainwt)))) Output [1] 0.0155 0.0024 0.1750 0.0700 0.0982 0.1150 ================================================ FILE: tests/testthat/helper-state.R ================================================ testthat::set_state_inspector(function() { # sometimes a dependency might add a custom option, so we need to # make sure we don't fail because of such additions options <- options() # Result of `dput(names(options()))` base_options <- c( "add.smooth", "askpass", "asksecret", "bitmapType", "browser", "browserNLdisabled", "buildtools.check", "buildtools.with", "callr.condition_handler_cli_message", "CBoundsCheck", "check.bounds", "citation.bibtex.max", "connectionObserver", "continue", "contrasts", "defaultPackages", "demo.ask", "deparse.cutoff", "deparse.max.lines", "device", "device.ask.default", "digits", "download.file.method", "dvipscmd", "echo", "editor", "encoding", "example.ask", "expressions", "ggvis.renderer", "help_type", "help.search.types", "help.try.all.packages", "HTTPUserAgent", "install.packages.compile.from.source", "internet.info", "keep.parse.data", "keep.parse.data.pkgs", "keep.source", "keep.source.pkgs", "locatorBell", "mailer", "matprod", "max.contour.segments", "max.print", "menu.graphics", "na.action", "nwarnings", "OutDec", "page_viewer", "pager", "papersize", "PCRE_limit_recursion", "PCRE_study", "PCRE_use_JIT", "pdfviewer", "pkgType", "plumber.docs.callback", "plumber.swagger.url", "printcmd", "profvis.keep_output", "profvis.print", "profvis.prof_extension", "profvis.prof_output", "prompt", "repos", "restart", "reticulate.initialized", "reticulate.repl.busy", "reticulate.repl.hook", "reticulate.repl.initialize", "reticulate.repl.teardown", "rl_word_breaks", "rsconnect.check.certificate", "rstudio.notebook.executing", "RStudioGD.antialias", "RStudioGD.backend", "scipen", "shiny.launch.browser", "shinygadgets.showdialog", "show.coef.Pvalues", "show.error.messages", "show.signif.stars", "showErrorCalls", "showNCalls", "showWarnCalls", "str", "str.dendrogram.last", "terminal.manager", "texi2dvi", "timeout", "ts.eps", "ts.S.compat", "unzip", "useFancyQuotes", "verbose", "viewer", "warn", "warning.length", "warnPartialMatchArgs", "warnPartialMatchAttr", "warnPartialMatchDollar", "width" ) options <- options[base_options] list( attached = search(), connections = nrow(showConnections()), cwd = getwd(), envvars = Sys.getenv(), libpaths = .libPaths(), locale = Sys.getlocale(), options = options, packages = .packages(all.available = TRUE), NULL ) }) ================================================ FILE: tests/testthat/helper.R ================================================ if (insight::check_if_installed("poorman", stop = FALSE)) { `%>%` <- poorman::`%>%` } ================================================ FILE: tests/testthat/test-adjust.R ================================================ test_that("adjust multilevel", { skip_if_not_installed("lme4") adj <- adjust( iris[c("Sepal.Length", "Species")], multilevel = TRUE, bayesian = FALSE ) # High tolerance to avoid issues on some R CMD check specification, see #592 expect_equal( head(adj$Sepal.Length), c(0.08698, -0.11302, -0.31302, -0.41302, -0.01302, 0.38698), tolerance = 1e-1 ) }) test_that("adjust", { adj <- adjust( iris[c("Sepal.Length", "Species")], multilevel = FALSE, bayesian = FALSE ) expect_equal( head(adj$Sepal.Length), c(0.094, -0.106, -0.306, -0.406, -0.006, 0.394), tolerance = 1e-3 ) }) # select helpers ------------------------------ test_that("adjust regex", { expect_identical( adjust(mtcars, select = "pg", regex = TRUE), adjust(mtcars, select = "mpg") ) expect_identical( adjust(mtcars, select = "pg$", regex = TRUE), adjust(mtcars, select = "mpg") ) }) # select helpers ------------------------------ test_that("adjust, invalid column names", { data(iris) colnames(iris)[1] <- "I am" expect_error( adjust(iris[c("I am", "Species")], multilevel = FALSE, bayesian = FALSE), regex = "Bad column names" ) }) ================================================ FILE: tests/testthat/test-assign_labels.R ================================================ test_that("assign_labels, unnamed values", { x <- 1:3 # labelling by providing required number of labels out <- assign_labels( x, variable = "My x", values = c("one", "two", "three") ) expect_identical(attributes(out)$label, "My x") expect_identical( attributes(out)$labels, structure(1:3, names = c("one", "two", "three")) ) }) test_that("assign_labels, named values", { # labelling using named vectors x <- factor(letters[1:3]) out <- assign_labels( x, variable = "Labelled factor", values = c(a = "low", b = "mid", c = "high") ) expect_identical(attributes(out)$label, "Labelled factor") expect_identical(attributes(out)$labels, c(low = "a", mid = "b", high = "c")) }) test_that("assign_labels, partially named values", { x <- 1:5 out <- assign_labels( x, variable = "My x", values = c(`1` = "lowest", `5` = "highest"), verbose = FALSE ) expect_identical(attributes(out)$label, "My x") expect_identical(attributes(out)$labels, c(lowest = 1, highest = 5)) }) test_that("assign_labels, errors", { x <- 1:5 expect_error(assign_labels(x, values = c(`1` = "lowest", `6` = "highest"))) expect_error(assign_labels( x, variable = 1, values = c(`1` = "lowest", `6` = "highest") )) expect_error(assign_labels(x, values = c("a", "b", "c"))) }) test_that("assign_labels, data frame", { data(iris) out <- assign_labels(iris, "Species", values = c("a", "b", "c")) expect_identical( attributes(out$Species)$labels, c(a = "setosa", b = "versicolor", c = "virginica") ) data(mtcars) out <- assign_labels( mtcars, select = c("am", "vs"), values = c("low", "high") ) expect_identical(attributes(out$am)$labels, c(low = 0, high = 1)) expect_identical(attributes(out$vs)$labels, c(low = 0, high = 1)) expect_null(attributes(out$gear)$labels) expect_null(attributes(out$cyl)$labels) }) ================================================ FILE: tests/testthat/test-attributes-grouped-df.R ================================================ # data_arrange ----------------------------------- test_that("data_arrange, attributes preserved", { # if dplyr:::`[.grouped_df` in the environment it destroys the attributes # (only occurs when we run tests in random order) skip_if("[.grouped_df" %in% methods(`[`)) x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_arrange(data_group(x, "cyl"), "hp") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # rescale ----------------------------------- test_that("rescale, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- rescale(data_group(x, "Species"), 1:3) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # center ----------------------------------- test_that("center, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- center(data_group(x, "Species"), "Sepal.Width") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # categorize ----------------------------------- test_that("categorize, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- categorize(data_group(x, "Species"), "Sepal.Width") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # standardize ----------------------------------- test_that("standardize, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- standardize(data_group(x, "Species"), "Sepal.Width") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # filter ----------------------------------- test_that("filter, attributes preserved", { # if dplyr:::`[.grouped_df` in the environment it destroys the attributes # (only occurs when we run tests in random order) skip_if("[.grouped_df" %in% methods(`[`)) test <- data.frame( id = c(1, 1, 2, 2), x = c(0, 1, 3, 4) ) attr(test, "myattri") <- "I'm here" test2 <- data_filter(data_group(test, "id"), x == min(x)) expect_identical(attr(test2, "myattri", exact = TRUE), "I'm here") }) ================================================ FILE: tests/testthat/test-attributes.R ================================================ data(efc, package = "datawizard") # data_filter ----------------------------------- test_that("data_filter, attributes preserved", { attr(efc, "myattri") <- "I'm here" x <- data_filter(efc, c172code == 1 & c12hour > 40) expect_identical( attr(x$e42dep, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE) ) expect_identical( attr(x, "myattri", exact = TRUE), "I'm here" ) }) # data_arrange ----------------------------------- test_that("data_arrange, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_arrange(x, "hp") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # data_match ----------------------------------- test_that("data_match, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_match(x, data.frame(vs = 0, am = 1)) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # data_select ----------------------------------- test_that("data_select, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_select(x, "hp") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # data_group ----------------------------------- test_that("data_group, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_group(x, "cyl") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # data_relocate ----------------------------------- test_that("data_relocate, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_relocate(x, "am", "mpg") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # data_remove ----------------------------------- test_that("data_remove, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_remove(x, "am") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # data_reorder ----------------------------------- test_that("data_reorder, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_reorder(x, c("hp", "vs", "wt")) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # data_to_long ----------------------------------- test_that("data_to_long, attributes preserved", { wide_data <- data.frame(replicate(5, rnorm(10))) attr(wide_data, "myattri") <- "I'm here" x2 <- data_to_long(wide_data) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # to_numeric ----------------------------------- test_that("to_numeric, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- to_numeric(x, "Species") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # convert_to_na ----------------------------------- test_that("convert_to_na, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- convert_to_na(x, na = 2, verbose = FALSE) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") # label attribute is preserved attr(x$Species, "label") <- "Species Variable" x2 <- convert_to_na(x, na = "setosa", drop_levels = TRUE, verbose = FALSE) expect_identical(attributes(x$Species)$label, "Species Variable") }) # data_rename ----------------------------------- test_that("data_rename, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- data_rename(x, select = "hp", replacement = "horsepower") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # rescale ----------------------------------- test_that("rescale, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- rescale(x, 1:3) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # center ----------------------------------- test_that("center, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- center(x, "Sepal.Width") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # categorize ----------------------------------- test_that("categorize, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- categorize(x, "Sepal.Width") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # change_code ----------------------------------- test_that("recode_values, attributes preserved", { x <- mtcars attr(x, "myattri") <- "I'm here" x2 <- recode_values(x, select = "am", recode = list(`5` = 0, `10` = 1)) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) # standardize ----------------------------------- test_that("standardize, attributes preserved", { x <- iris attr(x, "myattri") <- "I'm here" x2 <- standardize(x, "Sepal.Width") expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") }) ================================================ FILE: tests/testthat/test-categorize.R ================================================ set.seed(123) d <- sample.int(10, size = 500, replace = TRUE) test_that("recode median", { expect_identical(categorize(d), ifelse(d >= median(d), 2, 1)) expect_identical(categorize(d, lowest = 0), as.numeric(d >= median(d))) }) test_that("recode mean", { expect_identical(categorize(d, split = "mean"), ifelse(d >= mean(d), 2, 1)) expect_identical( categorize(d, split = "mean", lowest = 0), as.numeric(d >= mean(d)) ) }) test_that("recode quantile", { expect_error(categorize(d, split = "quantile")) q <- quantile(d, probs = c(1 / 3, 2 / 3, 1)) f <- cut( d, breaks = unique(c(min(d), q, max(d))), include.lowest = TRUE, right = FALSE ) levels(f) <- 1:nlevels(f) expect_identical( categorize(d, split = "quantile", n_groups = 3), as.numeric(f) ) expect_identical( categorize(d, split = "quantile", n_groups = 3, lowest = 0), as.numeric(f) - 1 ) }) set.seed(123) d <- sample.int(100, size = 1000, replace = TRUE) test_that("recode range", { expect_error(categorize(d, split = "range")) d2 <- d d2[d <= 20] <- 1 d2[d > 20 & d <= 40] <- 2 d2[d > 40 & d <= 60] <- 3 d2[d > 60 & d <= 80] <- 4 d2[d > 80] <- 5 expect_equal( table(categorize(d, split = "equal_range", range = 20)), table(d2), ignore_attr = TRUE ) expect_equal( table(categorize( d, split = "equal_range", range = 20, lowest = 1 )), table(d2), ignore_attr = TRUE ) d2 <- d d2[d < 20] <- 0 d2[d >= 20 & d < 40] <- 1 d2[d >= 40 & d < 60] <- 2 d2[d >= 60 & d < 80] <- 3 d2[d >= 80] <- 4 expect_equal( table(categorize( d, split = "equal_range", range = 20, lowest = 0 )), table(d2), ignore_attr = TRUE ) }) test_that("recode length", { expect_error(categorize(d, split = "equal_length")) d2 <- d d2[d <= 20] <- 1 d2[d > 20 & d <= 40] <- 2 d2[d > 40 & d <= 60] <- 3 d2[d > 60 & d <= 80] <- 4 d2[d > 80] <- 5 expect_equal( table(categorize(d, split = "equal_length", n_groups = 5)), table(d2), ignore_attr = TRUE ) expect_equal( table(categorize( d, split = "equal_length", n_groups = 5, lowest = 1 )), table(d2), ignore_attr = TRUE ) }) set.seed(123) x <- sample.int(10, size = 30, replace = TRUE) test_that("recode factor labels", { expect_type(categorize(x, "equal_length", n_groups = 3), "double") expect_s3_class( categorize( x, "equal_length", n_groups = 3, labels = c("low", "mid", "high") ), "factor" ) expect_identical( levels(categorize( x, "equal_length", n_groups = 3, labels = c("low", "mid", "high") )), c("low", "mid", "high") ) t1 <- table(categorize(x, "equal_length", n_groups = 3)) t2 <- table(categorize( x, "equal_length", n_groups = 3, labels = c("low", "mid", "high") )) expect_equal(t1, t2, ignore_attr = TRUE) }) test_that("recode data frame", { data(iris) x <- iris out <- categorize( x, split = "median", select = c("Sepal.Length", "Sepal.Width") ) expect_s3_class(out, "data.frame") expect_identical( out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1) ) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize(x, split = "median", select = starts_with("Sepal")) expect_s3_class(out, "data.frame") expect_identical( out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1) ) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize(x, split = "median", select = ~ Sepal.Width + Sepal.Length) expect_s3_class(out, "data.frame") expect_identical( out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1) ) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize(x, split = "median", select = Sepal.Length) expect_s3_class(out, "data.frame") expect_identical( out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1) ) expect_identical(out$Petal.Length, iris$Petal.Length) expect_warning( expect_warning( out <- categorize( x, split = "median", select = c("sepal.Length", "sepal.Width"), ignore_case = FALSE ), "not found" ), "not found" ) expect_identical(out$Sepal.Length, iris$Sepal.Length) out <- categorize( x, split = "median", select = starts_with("sepal"), ignore_case = TRUE ) expect_s3_class(out, "data.frame") expect_identical( out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1) ) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize( x, split = "median", select = starts_with("sepal"), ignore_case = FALSE ) expect_identical(out$Sepal.Length, iris$Sepal.Length) out <- categorize( x, split = "median", select = starts_with("sepal"), ignore_case = TRUE, append = "_r" ) expect_identical( colnames(out), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Length_r", "Sepal.Width_r" ) ) out <- categorize(iris, split = "median", select = starts_with("Sepal")) expect_identical( out$Sepal.Length, c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ) ) skip_if_not_installed("poorman") x <- poorman::group_by(iris, Species) out <- categorize(x, split = "median", select = starts_with("Sepal")) expect_identical( out$Sepal.Length, c( 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1 ) ) }) test_that("recode all NA", { x <- rep(NA, 10) expect_message( y <- categorize(x), "can't be recoded" ) expect_identical(y, x) x <- rep(NA_real_, 10) expect_message( y <- categorize(x), "only missing values" ) expect_identical(y, x) }) test_that("recode numeric", { expect_identical( categorize(mtcars$hp, split = c(100, 150)), c( 2, 2, 1, 2, 3, 2, 3, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 2, 3, 3, 3, 2 ) ) x <- mtcars$hp x[mtcars$hp < 100] <- 1 x[mtcars$hp >= 100 & mtcars$hp < 150] <- 2 x[mtcars$hp >= 150] <- 3 expect_identical(categorize(mtcars$hp, split = c(100, 150)), x) expect_identical(categorize(mtcars$hp, split = c(100, 150), lowest = NULL), x) expect_identical( categorize(mtcars$hp, split = "equal_range", range = 50, lowest = NULL), c( 2, 2, 1, 2, 3, 2, 4, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 1, 1, 1, 1, 2, 2, 4, 3, 1, 1, 2, 5, 3, 6, 2 ) ) }) # select helpers ------------------------------ test_that("categorize regex", { expect_identical( categorize(mtcars, select = "pg", regex = TRUE), categorize(mtcars, select = "mpg") ) }) # labelling ranges ------------------------------ test_that("categorize labelling ranged", { data(mtcars) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) expect_snapshot(categorize( mtcars$mpg, "equal_length", n_groups = 5, labels = "range" )) expect_snapshot(categorize( mtcars$mpg, "equal_length", n_groups = 5, labels = "observed" )) }) test_that("categorize breaks", { data(mtcars) expect_snapshot(categorize( mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive" )) expect_error( categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"), regex = "should be one of" ) }) ================================================ FILE: tests/testthat/test-center.R ================================================ test_that("center", { z <- center(iris$Sepal.Width) expect_equal( as.vector(z), iris$Sepal.Width - mean(iris$Sepal.Width), tolerance = 1e-4, ignore_attr = TRUE ) }) test_that("center, robust", { z <- center(mtcars$hp, robust = TRUE) expect_equal( as.vector(z), mtcars$hp - median(mtcars$hp), tolerance = 1e-4, ignore_attr = TRUE ) }) test_that("center, select", { z <- center(iris, select = "Sepal.Width") expect_equal( as.vector(z$Sepal.Width), iris$Sepal.Width - mean(iris$Sepal.Width), tolerance = 1e-4, ignore_attr = TRUE ) # check class attributes expect_identical( vapply(z, class, character(1)), c( Sepal.Length = "numeric", Sepal.Width = "numeric", Petal.Length = "numeric", Petal.Width = "numeric", Species = "factor" ) ) }) test_that("center, factors", { z <- center(iris, select = "Species") expect_identical(z$Species, iris$Species) }) test_that("center, force factors", { z <- center(iris, select = "Species", force = TRUE) v <- as.numeric(iris$Species) expect_equal( as.vector(z$Species), v - median(v), tolerance = 1e-4, ignore_attr = TRUE ) }) test_that("center, all na", { z <- center(c(NA, NA, NA)) expect_identical(z, c(NA, NA, NA)) }) test_that("center, with Inf", { z <- center(c(2, 4, Inf)) expect_equal(z, c(-1, 1, NA), ignore_attr = TRUE) }) test_that("center, all NA or Inf", { z <- center(c(NA, -Inf, Inf)) expect_equal(z, c(NA, -Inf, Inf), ignore_attr = TRUE) }) test_that("center works correctly with only one value", { expect_message( x <- center(100), # nolint "will be set to 0" ) expect_equal(x, 0, ignore_attr = TRUE) expect_equal(center(100, center = 1), 99, ignore_attr = TRUE) expect_equal( center(100, reference = mtcars$mpg), 100 - mean(mtcars$mpg), ignore_attr = TRUE ) }) # with grouped data ------------------------------------------- test_that("center (grouped data)", { skip_if_not_installed("poorman") datawizard <- iris %>% poorman::group_by(Species) %>% center(Sepal.Width) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) manual <- iris %>% poorman::group_by(Species) %>% poorman::mutate(Sepal.Width = Sepal.Width - mean(Sepal.Width)) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) expect_identical(datawizard, manual) }) test_that("center (grouped data), with force = TRUE", { skip_if_not_installed("poorman") datawizard_c <- iris %>% poorman::group_by(Species) %>% center(force = TRUE) %>% poorman::ungroup() manual_c <- iris %>% poorman::group_by(Species) %>% poorman::mutate( Sepal.Length = Sepal.Length - mean(Sepal.Length), Sepal.Width = Sepal.Width - mean(Sepal.Width), Petal.Length = Petal.Length - mean(Petal.Length), Petal.Width = Petal.Width - mean(Petal.Width) ) %>% poorman::ungroup() expect_equal(datawizard_c, manual_c, ignore_attr = TRUE) }) test_that("center, robust (grouped data)", { skip_if_not_installed("poorman") datawizard <- iris %>% poorman::group_by(Species) %>% center(Sepal.Width, robust = TRUE) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) manual <- iris %>% poorman::group_by(Species) %>% poorman::mutate(Sepal.Width = Sepal.Width - median(Sepal.Width)) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) expect_identical(datawizard, manual) }) test_that("center, select (grouped data)", { skip_if_not_installed("poorman") datawizard <- iris %>% poorman::group_by(Species) %>% center(select = starts_with("Sepal\\.W")) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) manual <- iris %>% poorman::group_by(Species) %>% poorman::mutate(Sepal.Width = Sepal.Width - mean(Sepal.Width)) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) expect_identical(datawizard, manual) }) test_that("center, factors (grouped data)", { skip_if_not_installed("poorman") datawizard <- iris %>% poorman::group_by(Species) %>% center(select = "Species") %>% poorman::ungroup() %>% poorman::pull(Species) manual <- poorman::pull(iris, Species) expect_identical(datawizard, manual) }) # select helpers ------------------------------ test_that("center regex", { expect_equal( center(mtcars, select = "pg", regex = TRUE)$mpg, center(mtcars$mpg), ignore_attr = TRUE ) expect_equal( center(mtcars, select = "pg$", regex = TRUE)$mpg, center(mtcars$mpg), ignore_attr = TRUE ) }) # no matches ------------------------------ test_that("center no match", { data(iris) expect_warning(center(iris, "Sepla.Length")) }) ================================================ FILE: tests/testthat/test-coef_var.R ================================================ test_that("coefficient of variation works", { expect_equal(coef_var(1:10), 0.5504818826) expect_equal(coef_var(1:10, method = "unbiased"), 0.5552700246) expect_equal(coef_var(c(1:10, 100), method = "median_mad"), 0.7413) expect_equal(coef_var(c(1:10, 100), method = "qcd"), 0.4166666667) expect_identical(coef_var(mu = 10, sigma = 20), 2) expect_equal( coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30), 2.250614348 ) expect_equal(distribution_coef_var(1:10), 0.5504818826) }) test_that("coef_var returns NULL if can't compute", { expect_warning( { x <- coef_var(as.Date("2022-10-31")) }, "Can't compute" ) expect_null(x) }) test_that("coef_var: argument 'remove_na' works", { expect_identical(coef_var(c(1:10, NA)), NA_real_) expect_identical( coef_var(1:10), coef_var(c(1:10, NA), remove_na = TRUE) ) }) test_that("coef_var: method 'unbiased' needs argument 'n' when sigma and mu are provided", { expect_error( coef_var(1:10, method = "unbiased", mu = 10, sigma = 20), "A value for `n` must be provided" ) }) ================================================ FILE: tests/testthat/test-contr.deviation.R ================================================ test_that("contr.deviation", { c.treatment <- solve(cbind(Intercept = 1, contr.treatment(3))) c.sum <- solve(cbind(Intercept = 1, contr.sum(3))) c.deviation <- solve(cbind(Intercept = 1, contr.deviation(3))) expect_equal(c.deviation[1, ], c.sum[1, ]) expect_equal(c.deviation[-1, ], c.treatment[-1, ]) }) test_that("contr.deviation | snapshot", { skip_if_not_installed("base", "4.3") # IF THIS TESTS FAILS, UPDATE THE EXAMPLE data("mtcars") mtcars <- data_modify(mtcars, cyl = factor(cyl)) mtcars <- data_modify(mtcars, am = factor(am)) mtcars <- data_arrange(mtcars, select = c("cyl", "am")) contrasts(mtcars$cyl) <- contr.deviation c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl)) expect_snapshot(solve(c.deviation)) mm <- unique(model.matrix(~ cyl * am, data = mtcars)) rownames(mm) <- c( "cyl4.am0", "cyl4.am1", "cyl6.am0", "cyl6.am1", "cyl8.am0", "cyl8.am1" ) expect_snapshot(solve(mm)) }) ================================================ FILE: tests/testthat/test-convert_na_to.R ================================================ # numeric -------------------------- test_that("convert_na_to - numeric: works", { expect_identical( convert_na_to(c(1, 2, 3, NA), replacement = 4), as.double(1:4) ) expect_warning( expect_identical( convert_na_to(c(1, 2, 3, NA), replacement = NULL), c(1, 2, 3, NA) ), "needs to be a numeric" ) }) test_that("convert_na_to - numeric: arg 'replacement' can only be numeric", { expect_warning( convert_na_to(c(1, 2, 3, NA), replacement = "a"), regexp = "`replacement` needs to be a numeric vector." ) expect_warning( convert_na_to(c(1, 2, 3, NA), replacement = factor(8)), regexp = "`replacement` needs to be a numeric vector." ) }) test_that("convert_na_to - numeric: arg 'replacement' must be of length one", { expect_warning( convert_na_to(c(1, 2, 3, NA), replacement = c(1, 2)), regexp = "`replacement` needs to be of length one." ) }) test_that("convert_na_to - numeric: returns original vector if 'replacement' not good", { expect_warning( expect_identical( convert_na_to(c(1, 2, 3, NA), replacement = "a"), c(1, 2, 3, NA) ), "needs to be a numeric" ) expect_warning( expect_identical( convert_na_to(c(1, 2, 3, NA), replacement = factor(8)), c(1, 2, 3, NA) ), "needs to be a numeric" ) }) # character -------------------------- test_that("convert_na_to - character: works", { expect_identical( convert_na_to(c("a", "b", "c", NA), replacement = "d"), c("a", "b", "c", "d") ) expect_warning( expect_identical( convert_na_to(c("a", "b", "c", NA), replacement = NULL), c("a", "b", "c", NA) ), "needs to be a character" ) }) test_that("convert_na_to - character: arg 'replacement' can only be character", { expect_warning( convert_na_to(c("a", "b", "c", NA), replacement = mtcars), regexp = "`replacement` needs to be a character or numeric vector." ) expect_warning( convert_na_to(c("a", "b", "c", NA), replacement = factor(8)), regexp = "`replacement` needs to be a character or numeric vector." ) }) test_that("convert_na_to - numeric: arg 'replacement' must be of length one", { expect_warning( convert_na_to(c("a", "b", "c", NA), replacement = c("d", "e")), regexp = "`replacement` needs to be of length one." ) }) test_that("convert_na_to - character: returns original vector if 'replacement' not good", { expect_identical( convert_na_to(c("a", "b", "c", NA), replacement = 1), c("a", "b", "c", 1) ) expect_warning( expect_identical( convert_na_to(c("a", "b", "c", NA), replacement = mtcars), c("a", "b", "c", NA) ), "needs to be a character or numeric vector" ) expect_warning( expect_identical( convert_na_to(c("a", "b", "c", NA), replacement = factor(8)), c("a", "b", "c", NA) ), "needs to be a character or numeric vector" ) }) # factor -------------------------- test_that("convert_na_to - factor: works when 'replacement' is numeric ", { x <- convert_na_to(factor(c(1, 2, 3, NA)), replacement = 4) expect_identical( x, factor(1:4) ) expect_identical(levels(x), as.character(1:4)) expect_warning( expect_identical( convert_na_to(factor(c(1, 2, 3, NA)), replacement = NULL), factor(c(1, 2, 3, NA)) ), "needs to be of length one" ) }) test_that("convert_na_to - factor: works when 'replacement' is character", { x <- convert_na_to(factor(c(1, 2, 3, NA)), replacement = "d") expect_identical( x, factor(c(1:3, "d")) ) expect_identical(levels(x), as.character(c(1:3, "d"))) }) # data frame -------------------------- test <- data.frame( x = c(1, 2, NA), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) test_that("convert_na_to - data frame: works with replace_* args", { expect_identical( convert_na_to(test, replace_num = 4, replace_char = "e", replace_fac = 8), data.frame( x = c(1, 2, 4), y = c("a", "b", "e"), z = factor(c("a", "b", "8"), levels = c("a", "b", "8")), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) }) test_that("convert_na_to - data frame: only modifies numeric if only numeric specified", { expect_identical( convert_na_to(test, replace_num = 4), data.frame( x = c(1, 2, 4), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) }) test_that("convert_na_to - data frame: only modifies character if only character specified", { expect_identical( convert_na_to(test, replace_char = "e"), data.frame( x = c(1, 2, NA), y = c("a", "b", "e"), z = factor(c("a", "b", NA)), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) ) }) test_that("convert_na_to - data frame: only modifies factor if only factor specified", { expect_identical( convert_na_to(test, replace_fac = 8), data.frame( x = c(1, 2, NA), y = c("a", "b", NA), z = factor(c("a", "b", "8"), levels = c("a", "b", "8")), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) ) }) test_that("convert_na_to - data frame: arg 'select' works", { expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = "x" ), data.frame( x = c(1, 2, 4), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = ~x ), data.frame( x = c(1, 2, 4), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = starts_with("x") ), data.frame( x = c(1, 2, 4), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = ends_with("2") ), data.frame( x = c(1, 2, NA), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = contains("x") ), data.frame( x = c(1, 2, 4), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = 1:3 ), data.frame( x = c(1, 2, 4), y = c("a", "b", "e"), z = factor(c("a", "b", "8"), levels = c("a", "b", "8")), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = regex("2$") ), data.frame( x = c(1, 2, NA), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) }) test_that("convert_na_to - data frame: arg 'exclude' works", { expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, exclude = "x" ), data.frame( x = c(1, 2, NA), y = c("a", "b", "e"), z = factor(c("a", "b", "8"), levels = c("a", "b", "8")), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, exclude = ~x ), data.frame( x = c(1, 2, NA), y = c("a", "b", "e"), z = factor(c("a", "b", "8"), levels = c("a", "b", "8")), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to( test, replace_num = 4, replace_char = "e", replace_fac = 8, select = starts_with("x"), exclude = ~x ), data.frame( x = c(1, 2, NA), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) }) test_that("convert_na_to - data frame: works when arg 'select' is a list", { # numeric expect_identical( convert_na_to(test, replace_num = 4, select = list(x = 0)), data.frame( x = c(1, 2, 0), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) # character expect_identical( convert_na_to(test, replace_char = "e", select = list(y = "d")), data.frame( x = c(1, 2, NA), y = c("a", "b", "d"), z = factor(c("a", "b", NA)), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) ) # only named list can override replace_* expect_identical( convert_na_to(test, replace_num = 4, select = list(0)), data.frame( x = c(1, 2, 4), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ) expect_identical( convert_na_to(test, replace_char = "e", select = list("d")), data.frame( x = c(1, 2, NA), y = c("a", "b", "e"), z = factor(c("a", "b", NA)), x2 = c(4, 5, NA), stringsAsFactors = FALSE ) ) # no problem if put a variable that doesn't exist in list expect_warning( expect_identical( convert_na_to(test, replace_num = 4, select = list(x = 0, foo = 5)), data.frame( x = c(1, 2, 0), y = c("a", "b", NA), z = factor(c("a", "b", NA)), x2 = c(4, 5, 4), stringsAsFactors = FALSE ) ), "not found" ) }) # preserve attributes -------------------------- test_that("data_rename preserves attributes", { skip_if_not_installed("parameters") m <- lm(Sepal.Length ~ Species, data = iris) out <- parameters::parameters(m) a1 <- attributes(out) out2 <- convert_na_to(out, replace_num = 5) a2 <- attributes(out2) expect_identical(names(a1)[1:28], names(a2)[1:28]) }) # select helpers ------------------------------ test_that("convert_na_to regex", { expect_identical( convert_na_to(airquality, replacement = 0, select = "zone", regex = TRUE), convert_na_to(airquality, replacement = 0, select = "Ozone") ) expect_identical( convert_na_to(airquality, replacement = 0, select = "zone$", regex = TRUE), convert_na_to(airquality, replacement = 0, select = "Ozone") ) }) ================================================ FILE: tests/testthat/test-convert_to_na.R ================================================ data(iris) test_that("convert_to_na-factor", { x <- convert_to_na(iris$Species, na = "versicolor") expect_identical(sum(is.na(x)), 50L) x <- convert_to_na(iris$Species, na = list(2, "versicolor")) expect_identical(sum(is.na(x)), 50L) x <- convert_to_na( iris$Species, na = list(2, "versicolor"), drop_levels = FALSE ) expect_identical(levels(x), c("setosa", "versicolor", "virginica")) expect_identical(as.vector(table(x)), c(50L, 0L, 50L)) x <- convert_to_na( iris$Species, na = list(2, "versicolor"), drop_levels = TRUE ) expect_identical(levels(x), c("setosa", "virginica")) expect_identical(as.vector(table(x)), c(50L, 50L)) expect_message( x <- convert_to_na(iris$Species, na = 2), # nolint "for a factor or character variable" ) expect_identical(sum(is.na(x)), 0L) }) test_that("convert_to_na-numeric", { x <- convert_to_na(iris$Sepal.Length, na = 5) expect_identical(sum(is.na(x)), sum(iris$Sepal.Length == 5)) x <- convert_to_na(iris$Sepal.Length, na = list(5, "versicolor")) expect_identical(sum(is.na(x)), 10L) x <- convert_to_na(iris$Sepal.Width, na = "a", verbose = FALSE) expect_message( convert_to_na(iris$Sepal.Width, na = "a"), "needs to be a numeric vector" ) expect_identical(sum(is.na(x)), 0L) }) test_that("convert_to_na-df", { expect_message( x <- convert_to_na(iris, na = 5), # nolint "needs to be a character vector" ) expect_identical( sum(is.na(x)), sum(vapply(iris, function(i) sum(i == 5), FUN.VALUE = integer(1L))) ) x <- convert_to_na(iris, na = list(5, "versicolor")) expect_identical(sum(is.na(x)), 64L) data(iris) expect_message( x <- convert_to_na(iris, na = 3), # nolint "needs to be a character vector" ) expect_identical( sum(is.na(x)), sum(vapply( iris, function(i) { if (is.numeric(i)) { sum(i == 3) } else { 0L } }, FUN.VALUE = integer(1L) )) ) x <- convert_to_na(iris, na = list(3, "3")) expect_identical(sum(is.na(x)), 27L) }) test_that("convert_to_na other classes", { d <- data.frame( a = 1:5, b = factor(letters[1:5]), c = as.Date(c( "2022-03-22", "2022-01-02", "2022-02-02", "2021-04-02", "2020-01-19" )), d = c(TRUE, TRUE, FALSE, FALSE, TRUE), e = as.complex(1:5) ) x <- convert_to_na(d$a, na = 3) expect_equal(x, c(1, 2, NA, 4, 5), tolerance = 1e-3, ignore_attr = TRUE) expect_message( x <- convert_to_na(d$a, na = "c"), "needs to be a numeric vector" ) # nolint expect_equal(x, 1:5, tolerance = 1e-3, ignore_attr = TRUE) x <- convert_to_na(d$b, na = "c") expect_equal( x, structure( c(1L, 2L, NA, 4L, 5L), .Label = c("a", "b", "c", "d", "e"), class = "factor" ), tolerance = 1e-3, ignore_attr = TRUE ) x <- convert_to_na(d$b, na = "c", drop_levels = TRUE) expect_equal( x, structure( c(1L, 2L, NA, 3L, 4L), .Label = c("a", "b", "d", "e"), class = "factor" ), tolerance = 1e-3, ignore_attr = TRUE ) expect_message( convert_to_na(d$c, na = "2022-03-22"), "of class 'Date'" ) x <- convert_to_na(d$c, na = as.Date("2022-03-22")) expect_equal( x, structure(c(NA, 18994, 19025, 18719, 18280), class = "Date"), tolerance = 1e-3, ignore_attr = TRUE ) x <- convert_to_na(d$d, na = TRUE) expect_equal( x, c(NA, NA, FALSE, FALSE, NA), tolerance = 1e-3, ignore_attr = TRUE ) expect_message( x <- convert_to_na(d$e, na = as.complex(4)), # nolint "variables of class `complex`" ) expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE) out <- data.frame( a = c(1, 2, NA, 4, 5), b = factor(c("a", "b", NA, "d", "e"), levels = letters[1:5]), c = as.Date(c( "2022-03-22", "2022-01-02", "2022-02-02", "2021-04-02", "2020-01-19" )), d = c(NA, NA, FALSE, FALSE, NA), e = as.complex(1:5) ) convert_to_na(d, na = list(3, "c", TRUE, "2022-01-02"), verbose = FALSE) x <- convert_to_na( d, na = list(3, "c", TRUE, as.Date("2022-01-02")), verbose = FALSE ) expect_equal(x, out, ignore_attr = TRUE, tolerance = 1e-3) }) # select helpers ------------------------------ test_that("convert_to_na regex", { expect_identical( convert_to_na(mtcars, na = 4, select = "arb", regex = TRUE), convert_to_na(mtcars, na = 4, select = "carb") ) expect_identical( convert_to_na(mtcars, na = 4, select = "arb$", regex = TRUE), convert_to_na(mtcars, na = 4, select = "carb") ) }) ================================================ FILE: tests/testthat/test-data_addprefix.R ================================================ test_that("data_addprefix works as expected", { expect_equal( names(head(data_addprefix(iris, "NEW_"))), c( "NEW_Sepal.Length", "NEW_Sepal.Width", "NEW_Petal.Length", "NEW_Petal.Width", "NEW_Species" ) ) expect_equal( names(head(data_addsuffix(iris, "_OLD"))), c( "Sepal.Length_OLD", "Sepal.Width_OLD", "Petal.Length_OLD", "Petal.Width_OLD", "Species_OLD" ) ) expect_equal( names(head(data_addprefix(iris, "NEW_", select = starts_with("Sepal")))), c( "NEW_Sepal.Length", "NEW_Sepal.Width", "Petal.Length", "Petal.Width", "Species" ) ) expect_equal( names(head(data_addsuffix(iris, "_OLD", select = starts_with("Petal")))), c( "Sepal.Length", "Sepal.Width", "Petal.Length_OLD", "Petal.Width_OLD", "Species" ) ) }) # select helpers ------------------------------ test_that("data_addprefix regex", { expect_equal( data_addsuffix(mtcars, "_regex", select = "pg", regex = TRUE), data_addsuffix(mtcars, "_regex", select = "mpg") ) expect_equal( data_addsuffix(mtcars, select = "pg$", "_regex", regex = TRUE), data_addsuffix(mtcars, select = "mpg", "_regex") ) }) ================================================ FILE: tests/testthat/test-data_arrange.R ================================================ df <- head(mtcars) df$character <- c("a", "b", "b", "c", "c", "a") test_that("data_arrange works with one numeric column", { skip_if_not_installed("poorman") expect_identical( poorman::arrange(df, carb), data_arrange(df, "carb") ) expect_identical( poorman::arrange(df, -carb), data_arrange(df, "-carb") ) }) test_that("data_arrange works with one character column", { skip_if_not_installed("poorman") expect_identical( poorman::arrange(df, character), data_arrange(df, "character") ) expect_identical( poorman::arrange(df, desc(character)), data_arrange(df, "-character") ) }) test_that("data_arrange works with several columns", { skip_if_not_installed("poorman") expect_identical( poorman::arrange(df, carb, gear), data_arrange(df, c("carb", "gear")) ) expect_identical( poorman::arrange(df, -carb, gear), data_arrange(df, c("-carb", "gear")) ) expect_identical( poorman::arrange(df, -carb, desc(character)), data_arrange(df, c("-carb", "-character")) ) }) test_that("data_arrange works without columns", { expect_identical(data_arrange(df), df) }) test_that("data_arrange ignores wrong names if safe = TRUE", { expect_warning( expect_identical(data_arrange(df, "foo"), df), regexp = "don't exist" ) expect_warning( expect_identical( data_arrange(df, c("gear", "foo")), data_arrange(df, "gear") ), regexp = "don't exist" ) }) test_that("data_arrange errors if safe = FALSE", { expect_error(data_arrange(df, "foo", safe = FALSE)) }) test_that("data_arrange errors if not coercable to data frame", { expect_error(data_arrange(list(a = 1:5, b = letters[1:3]), select = "b")) expect_equal( data_arrange(list(a = 1:5, b = letters[5:1]), select = "b"), structure( list(a = 5:1, b = c("a", "b", "c", "d", "e")), row.names = 5:1, class = "data.frame" ), ignore_attr = TRUE ) }) test_that("data_arrange works with grouped df", { set.seed(123) x <- mtcars[ sample(seq_len(nrow(mtcars)), 10, replace = TRUE), c("cyl", "mpg") ] g <- data_group(x, cyl) expected <- data.frame( cyl = c(4, 4, 4, 6, 6, 8, 8, 8, 8, 8), mpg = c(22.8, 30.4, 32.4, 17.8, 19.2, 10.4, 15, 15.2, 15.5, 18.7) ) class(expected) <- c("grouped_df", "data.frame") rownames(expected) <- c( "Datsun 710", "Honda Civic", "Fiat 128", "Merc 280C", "Merc 280", "Cadillac Fleetwood", "Maserati Bora", "Merc 450SLC", "Dodge Challenger", "Hornet Sportabout" ) attributes(expected)$groups <- attributes(g)$groups expect_identical( data_arrange(g, "mpg"), expected, ignore_attr = TRUE ) }) test_that("data_arrange works with NA", { # without groups tmp <- data.frame( a = c(1, 2, 2, 8, 1, 3), b = c(1, NA, 3, 3, NA, 5) ) expect_identical( data_arrange(tmp, "a"), data.frame( a = c(1, 1, 2, 2, 3, 8), b = c(1, NA, NA, 3, 5, 3) ) ) # with groups g <- data_group(tmp, "b") expected <- data.frame( a = c(1, 2, 8, 3, 1, 2), b = c(1, 3, 3, 5, NA, NA) ) class(expected) <- c("grouped_df", "data.frame") attributes(expected)$groups <- attributes(g)$groups expect_identical( data_arrange(g, "a"), expected, ignore_attr = TRUE ) }) test_that("data_arrange works one-column data frames (and does not drop dimensions)", { data(mtcars) expect_s3_class(data_arrange(mtcars["gear"], select = "gear"), "data.frame") expect_s3_class( data_arrange(mtcars[c("gear", "cyl")], select = "gear"), "data.frame" ) }) ================================================ FILE: tests/testthat/test-data_codebook.R ================================================ data(efc) data(iris) test_that("data_codebook iris", { expect_snapshot(data_codebook(iris)) }) test_that("data_codebook iris, reordered", { expect_snapshot(data_codebook(iris[c(1, 2, 5, 3, 4)])) }) test_that("data_codebook NaN and Inf", { d <- data.frame( x = c(1, 4, NA, Inf, 4, NaN, 2, 1, 1) ) expect_snapshot(data_codebook(d)) set.seed(123) d <- data.frame( x = c(sample.int(15, 100, TRUE), Inf, Inf) ) expect_snapshot(data_codebook(d)) expect_snapshot(data_codebook(d, range_at = 100)) expect_snapshot(data_codebook(d, range_at = 100, max_values = 4)) }) test_that("data_codebook, tinytable", { skip_if_not_installed("tinytable") d <- data.frame( x = c(1, 4, NA, Inf, 4, NaN, 2, 1, 1) ) expect_snapshot(display(data_codebook(d), format = "tt")) set.seed(123) d <- data.frame( x = c(sample.int(15, 100, TRUE), Inf, Inf) ) expect_snapshot(display(data_codebook(d), format = "tt")) expect_snapshot(display(data_codebook(d, range_at = 100), format = "tt")) expect_snapshot(display( data_codebook(d, range_at = 100, max_values = 4), format = "tt" )) data(iris) expect_snapshot(display(data_codebook(iris[c(1, 2, 5, 3, 4)]), format = "tt")) }) test_that("data_codebook iris, select", { expect_snapshot(data_codebook(iris, select = starts_with("Sepal"))) }) test_that("data_codebook iris, select, ID", { expect_snapshot(data_codebook(iris, select = starts_with("Petal"))) }) test_that("data_codebook efc", { expect_snapshot(print(data_codebook(efc), table_width = Inf)) expect_snapshot(print( data_codebook(efc), table_width = "auto", remove_duplicates = FALSE )) expect_snapshot(print( data_codebook(efc), table_width = "auto", remove_duplicates = TRUE )) }) test_that("data_codebook efc, variable_label_width", { out <- data_codebook(efc, variable_label_width = 30) expect_snapshot(print(out, table_width = Inf)) expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE)) expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook efc, value_label_width", { out <- data_codebook(efc, variable_label_width = 30, value_label_width = 15) expect_snapshot(print(out, table_width = Inf)) expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE)) expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook truncated data", { set.seed(123) d <- data.frame( a = sample.int(15, 100, TRUE), b = sample(letters[1:18], 100, TRUE), stringsAsFactors = FALSE ) expect_snapshot(data_codebook(d, max_values = 5)) }) test_that("data_codebook mixed numeric lengths", { set.seed(123) d <- data.frame( a = sample.int(4, 100, TRUE), b = sample(5:15, 100, TRUE), stringsAsFactors = FALSE ) expect_snapshot(data_codebook(d)) }) test_that("data_codebook mixed range_at", { set.seed(123) d <- data.frame( a = sample.int(4, 100, TRUE), b = sample(5:15, 100, TRUE), stringsAsFactors = FALSE ) expect_snapshot(data_codebook(d, range_at = 3)) }) test_that("data_codebook logicals", { set.seed(123) d <- data.frame( a = sample.int(15, 100, TRUE), b = sample(letters[1:3], 100, TRUE), c = sample(c(TRUE, FALSE), 100, TRUE), stringsAsFactors = FALSE ) expect_snapshot(data_codebook(d)) }) test_that("data_codebook labelled data exceptions", { set.seed(123) f1 <- sample.int(5, 100, TRUE) f1[f1 == 4] <- NA attr(f1, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five")) f2 <- sample.int(5, 100, TRUE) attr(f2, "labels") <- setNames(c(1:3, 5), c("One", "Two", "Three", "Five")) f3 <- sample.int(5, 100, TRUE) attr(f3, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five")) d <- data.frame(f1, f2, f3) expect_snapshot(data_codebook(d)) }) test_that("data_codebook labelled data factors", { set.seed(123) f1 <- factor(sample(c("c", "b", "a"), 100, TRUE)) attr(f1, "labels") <- setNames(c("c", "b", "a"), c("Cee", "Bee", "A")) f2 <- factor(sample(c("a", "b", "c"), 100, TRUE)) attr(f2, "labels") <- setNames(c("c", "b", "a"), c("Cee", "Bee", "A")) f3 <- factor(sample(c("c", "b", "a"), 100, TRUE)) attr(f3, "labels") <- setNames(c("a", "c", "b"), c("A", "Cee", "Bee")) d <- data.frame(f1, f2, f3) expect_snapshot(data_codebook(d)) }) test_that("data_codebook works with numbers < 1", { d <- data.frame( a = c(1, 1, 2, 2, 3, 3), b = c(0, 0, 0, 1, 1, 2) ) expect_snapshot(data_codebook(d)) }) test_that("data_codebook, big marks", { set.seed(123) f1 <- factor(sample(c("c", "b", "a"), 1e6, TRUE)) f2 <- factor(sample.int(3, 1e6, TRUE)) d <- data.frame(f1, f2) expect_snapshot(data_codebook(d)) }) test_that("data_codebook, tagged NA", { skip_if_not_installed("haven") x <- haven::labelled( x = c( 1:3, haven::tagged_na("a", "c", "z"), 4:1, haven::tagged_na("a", "a", "c"), 1:3, haven::tagged_na("z", "c", "c"), 1:4, haven::tagged_na("a", "c", "z") ), labels = c( Agreement = 1, Disagreement = 4, First = haven::tagged_na("c"), Refused = haven::tagged_na("a"), `Not home` = haven::tagged_na("z") ) ) expect_snapshot(data_codebook(data.frame(x))) x <- haven::labelled( x = c( 1:3, haven::tagged_na("a", "c"), 4:1, haven::tagged_na("a", "a", "c"), 1:3, haven::tagged_na("c", "c"), 1:4, haven::tagged_na("a", "c") ), labels = c( Agreement = 1, Disagreement = 4, First = haven::tagged_na("c"), Refused = haven::tagged_na("a"), `Not home` = haven::tagged_na("z") ) ) expect_snapshot(data_codebook(data.frame(x))) }) test_that("data_codebook, negative label values #334", { skip_if_not_installed("haven") x1 <- haven::labelled( x = 1:4, labels = c(Agreement = 1, Disagreement = 4, Missing = -9) ) x2 <- haven::labelled( x = c(1:3, -9), labels = c(Agreement = 1, Disagreement = 4, Missing = -9) ) expect_snapshot(data_codebook(data.frame(x1, x2))) }) test_that("data_codebook, informative warning if no match", { data(iris) expect_warning( data_codebook(iris, select = starts_with("abc")), regex = "No column names that matched" ) }) ================================================ FILE: tests/testthat/test-data_duplicated.R ================================================ # Preparations df1 <- data.frame( id = c(1, 2, 3, 1, 3), year = c(2022, 2022, 2022, 2022, 2000), item1 = c(NA, 1, 1, 2, 3), item2 = c(NA, 1, 1, 2, 3), item3 = c(NA, 1, 1, 2, 3) ) expected1 <- data.frame( Row = c(1, 4, 3, 5), id = c(1, 1, 3, 3), year = c(2022, 2022, 2022, 2000), item1 = c(NA, 2, 1, 3), item2 = c(NA, 2, 1, 3), item3 = c(NA, 2, 1, 3), count_na = c(3, 0, 0, 0) ) expected2 <- data.frame( Row = c(1, 4), id = c(1, 1), year = c(2022, 2022), item1 = c(NA, 2), item2 = c(NA, 2), item3 = c(NA, 2), count_na = c(3, 0) ) # Testing test_that("data_duplicated basic", { x <- data_duplicated(df1, select = "id") rownames(x) <- NULL expect_equal( x, expected1 ) }) test_that("data_duplicated unquoted", { x <- data_duplicated(df1, select = id) rownames(x) <- NULL expect_equal( x, expected1 ) }) test_that("data_duplicated vector", { x <- data_duplicated(df1, select = 1) rownames(x) <- NULL expect_equal( x, expected1 ) }) test_that("data_duplicated select-helper", { x <- data_duplicated(df1, select = contains("id")) rownames(x) <- NULL expect_equal( x, expected1 ) }) test_that("data_duplicated multiple IDs", { x <- data_duplicated(df1, select = c("id", "year")) rownames(x) <- NULL expect_equal( x, expected2 ) }) test_that("data_duplicated multiple IDs formula", { x <- data_duplicated(df1, select = ~ id + year) rownames(x) <- NULL expect_equal( x, expected2 ) }) test_that("data_duplicated multiple IDs vector", { x <- data_duplicated(df1, select = 1:2) rownames(x) <- NULL expect_equal( x, expected2 ) }) test_that("data_unique works with groups", { df <- data.frame( g = c(1, 1, 2, 2), x = c(1, 1, 2, 1) ) df <- data_group(df, "g") expected <- data.frame( Row = 1:2, g = c(1, 1), x = c(1, 1), count_na = c(0, 0) ) expected <- data_group(expected, "g") expect_identical(data_duplicated(df, "x"), expected, ignore_attr = TRUE) }) ================================================ FILE: tests/testthat/test-data_extract.R ================================================ data(efc) test_that("data_extract works with select-length > 1", { # works with multiple selects expect_s3_class( data_extract(efc, select = c("e42dep", "c172code")), "data.frame" ) # colnames properly set expect_named( data_extract(efc, select = c("e42dep", "c172code")), c("e42dep", "c172code") ) # properly extract vector, w/o naming expect_identical(data_extract(efc, select = "e42dep"), efc$e42dep) # properly extract vector, with naming x <- data_extract(efc, select = "e42dep", name = "c172code") expect_named(x, as.character(efc$c172code)) }) test_that("data_extract works with select-helpers", { expect_identical( data_extract(iris, starts_with("Sepal")), iris[c("Sepal.Length", "Sepal.Width")] ) expect_identical( data_extract(iris, 1:3), iris[1:3] ) expect_identical( data_extract(iris, "Species"), iris$Species ) expect_identical( data_extract(iris, contains("Wid")), iris[c("Sepal.Width", "Petal.Width")] ) expect_identical( data_extract(iris, Sepal.Width), iris$Sepal.Width ) }) test_that("data_extract works with formulas", { expect_identical( data_extract(iris, ~ Sepal.Width + Species), iris[c("Sepal.Width", "Species")] ) }) test_that("data_extract from other functions", { test_fun <- function(data, i) { data_extract(data, select = i) } expect_identical( test_fun(iris, c("Sepal.Length", "Sepal.Width")), iris[c("Sepal.Length", "Sepal.Width")] ) }) test_that("data_extract extract, pull", { expect_identical( data_extract(iris, starts_with("Sepal")), iris[c("Sepal.Length", "Sepal.Width")] ) expect_identical( data_extract(iris, starts_with("Sepal"), extract = "first"), iris$Sepal.Length ) expect_identical( data_extract(iris, starts_with("Sepal"), extract = "last"), iris$Sepal.Width ) expect_identical( data_extract( iris, starts_with("Sepal"), extract = "last", as_data_frame = TRUE ), iris["Sepal.Width"] ) expect_identical( colnames(data_extract(mtcars, contains("a"))), c("drat", "am", "gear", "carb") ) expect_identical( colnames(data_extract(mtcars, contains("a"), extract = "odd")), c("drat", "gear") ) expect_identical( colnames(data_extract(mtcars, contains("a"), extract = "even")), c("am", "carb") ) expect_identical( colnames(data_extract(mtcars, cyl:drat)), c("cyl", "disp", "hp", "drat") ) expect_error(colnames(data_extract(mtcars, Cyl:Drat))) expect_identical( colnames(data_extract(mtcars, Cyl:Drat, ignore_case = TRUE)), c("cyl", "disp", "hp", "drat") ) expect_identical( colnames(data_extract(iris, contains("Sep"))), c("Sepal.Length", "Sepal.Width") ) expect_null(colnames(data_extract(iris, contains("sep")))) expect_identical( colnames(data_extract(iris, contains("sep"), ignore_case = TRUE)), c("Sepal.Length", "Sepal.Width") ) expect_identical( colnames(data_extract(iris, c(1:2, 5))), c("Sepal.Length", "Sepal.Width", "Species") ) }) # select helpers ------------------------------ test_that("data_extract regex", { expect_identical( data_extract(mtcars, select = "pg", regex = TRUE), data_extract(mtcars, select = "mpg") ) expect_identical( data_extract(mtcars, select = "pg$", regex = TRUE), data_extract(mtcars, select = "mpg") ) }) test_that("data_extract: 'name' is numeric", { expect_identical( data_extract(mtcars, "gear", 1), data_extract(mtcars, "gear", "mpg") ) expect_identical( data_extract(mtcars, "gear", -2), data_extract(mtcars, "gear", "gear") ) expect_identical( data_extract(mtcars, "gear", 0), data_extract(mtcars, "gear", "row.names") ) }) ================================================ FILE: tests/testthat/test-data_group.R ================================================ data(efc) test_that("data_group attributes", { x <- data_group(efc, "c172code") expect_identical( attributes(x)$groups, structure( list( c172code = c(1, 2, 3, NA), .rows = list( c(3L, 14L, 30L, 32L, 36L, 77L, 91L, 99L), c( 1L, 2L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L, 16L, 17L, 18L, 21L, 22L, 23L, 24L, 25L, 26L, 28L, 29L, 31L, 33L, 34L, 35L, 37L, 38L, 39L, 40L, 42L, 44L, 45L, 46L, 47L, 50L, 51L, 52L, 53L, 54L, 56L, 57L, 59L, 60L, 62L, 65L, 68L, 69L, 71L, 72L, 73L, 76L, 78L, 80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 90L, 92L, 93L, 96L, 100L ), c( 13L, 15L, 19L, 20L, 27L, 41L, 43L, 55L, 58L, 64L, 66L, 67L, 74L, 75L, 79L, 89L ), c(9L, 48L, 49L, 61L, 63L, 70L, 94L, 95L, 97L, 98L) ) ), row.names = c(2L, 1L, 4L, 3L), class = "data.frame", .drop = TRUE ) ) expect_s3_class(x, "grouped_df") }) test_that("data_group attributes", { skip_if_not_installed("poorman") x <- data_group(efc, "c172code") out <- poorman::summarise(x, mw = mean(c12hour, na.rm = TRUE)) expect_equal(out$mw, c(87.125, 94.046875, 75), tolerance = 1e-3) }) # select helpers ------------------------------ test_that("data_group regex", { expect_identical( attributes(data_group(mtcars, select = "yl", regex = TRUE))$groups[[1]], sort(unique(mtcars$cyl)) ) }) test_that("data_ungroup works", { x <- data_group(efc, "c172code") attr(x, "foo") <- TRUE ungrouped <- data_ungroup(x) expect_false(inherits(ungrouped, "grouped_df")) expect_true(attributes(x)$foo) }) ================================================ FILE: tests/testthat/test-data_match.R ================================================ data(efc, package = "datawizard") test_that("data_match works as expected", { matching_rows <- data_match( mtcars, data.frame(vs = 0, am = 1), return_indices = TRUE ) df1 <- mtcars[matching_rows, ] expect_identical(unique(df1$vs), 0) expect_identical(unique(df1$am), 1) matching_rows <- data_match( mtcars, data.frame(vs = 0, am = c(0, 1)), return_indices = TRUE ) df2 <- mtcars[matching_rows, ] expect_identical(unique(df2$vs), 0) expect_identical(unique(df2$am), c(1, 0)) }) test_that("data_match works with missing data", { skip_if_not_installed("poorman") # "OR" works x1 <- length(data_match( efc, data.frame(c172code = 1, e16sex = 2), match = "or", return_indices = TRUE )) x2 <- nrow(poorman::filter(efc, c172code == 1 | e16sex == 2)) expect_identical(x1, x2) # "AND" works x1 <- length(data_match( efc, data.frame(c172code = 1, e16sex = 2), match = "and", return_indices = TRUE )) x2 <- nrow(poorman::filter(efc, c172code == 1, e16sex == 2)) expect_identical(x1, x2) # "NOT" works x1 <- length(data_match( efc, data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE )) x2 <- nrow(poorman::filter(efc, c172code != 1, e16sex != 2)) expect_identical(x1, x2) # remove NA x1 <- length(data_match( efc, data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, remove_na = FALSE )) expect_identical(x1, 41L) x1 <- length(data_match( efc, data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, remove_na = TRUE )) expect_identical(x1, 36L) }) test_that("data_match and data_filter work similar", { out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "not") out2 <- data_filter(mtcars, vs != 0 & am != 1) expect_equal(out1, out2, ignore_attr = TRUE) # using a data frame re-orders rows! out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "or") out2 <- data_filter(mtcars, vs == 0 | am == 1) expect_equal( out1[order(out1$vs, out1$am), ], out2[order(out2$vs, out2$am), ], ignore_attr = TRUE ) # string representation is working out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "or") out2 <- data_filter(mtcars, "vs == 0 | am == 1") expect_equal( out1[order(out1$vs, out1$am), ], out2[order(out2$vs, out2$am), ], ignore_attr = TRUE ) }) test_that("data_filter works", { out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "not") out2 <- data_filter(mtcars, vs != 0 & am != 1) out3 <- subset(mtcars, vs != 0 & am != 1) out4 <- data_filter(mtcars, vs != 0, am != 1) expect_equal(out1, out2, ignore_attr = TRUE) expect_equal(out1, out3, ignore_attr = TRUE) expect_equal(out2, out4, ignore_attr = TRUE) }) test_that("data_filter works with string representation", { out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "not") out2 <- data_filter(mtcars, "vs != 0 & am != 1") out3 <- subset(mtcars, vs != 0 & am != 1) out4 <- data_filter(mtcars, c("vs != 0", "am != 1")) expect_equal(out1, out2, ignore_attr = TRUE) expect_equal(out1, out3, ignore_attr = TRUE) expect_equal(out2, out3, ignore_attr = TRUE) expect_equal(out2, out4, ignore_attr = TRUE) }) test_that("data_filter works like slice", { out <- data_filter(mtcars, 5:10) expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE) out <- data_filter(mtcars, "5:10") expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE) slc <- 5:10 out <- data_filter(mtcars, slc) expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE) slc <- "5:10" out <- data_filter(mtcars, slc) expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE) }) test_that("data_filter gives informative message on errors", { expect_error( data_filter(mtcars, mpg = 10), "`==`" ) expect_error( data_filter(mtcars, "mpg > 10 || cyl = 4"), "`==`" ) expect_error( data_filter(mtcars, mpg > 10 || cyl == 4), "`||`" ) expect_error( data_filter(mtcars, mpg > 10 && cyl == 4), "`&&`" ) ## TODO: need to check why this fails on R 4.1 skip_if(getRversion() < "4.2.0") expect_error( data_filter(mtcars, mpg > 10?cyl == 4), "syntax" ) expect_error( data_filter(mtcars, mgp > 10?cyl == 4), "Variable \"mgp\"" ) }) test_that("data_filter gives informative message on errors", { data(mtcars) expect_error( data_filter(mtcars, cxl == 6), regex = "Variable \"cxl\"" ) expect_error( data_filter(mtcars, "cxl == 6"), regex = "Variable \"cxl\"" ) }) test_that("data_filter works with >= or <=", { expect_identical( data_filter(mtcars, "mpg >= 30.4"), subset(mtcars, mpg >= 30.4) ) expect_identical( data_filter(mtcars, mpg >= 30.4), subset(mtcars, mpg >= 30.4) ) expect_identical( data_filter(mtcars, "mpg <= 30.4"), subset(mtcars, mpg <= 30.4) ) expect_identical( data_filter(mtcars, mpg <= 30.4), subset(mtcars, mpg <= 30.4) ) mpgl30 <- "mpg <= 30.4" expect_identical( data_filter(mtcars, mpgl30), subset(mtcars, mpg <= 30.4) ) expect_identical( data_filter(mtcars, "mpg >= 30.4 & hp == 66"), subset(mtcars, mpg >= 30.4 & hp == 66) ) expect_identical( data_filter(mtcars, mpg <= 30.4 & hp == 66), subset(mtcars, mpg <= 30.4 & hp == 66) ) mpgl30hp66 <- "mpg >= 30.4 & hp == 66" expect_identical( data_filter(mtcars, mpgl30hp66), subset(mtcars, mpg >= 30.4 & hp == 66) ) }) test_that("programming with data_filter", { # One arg ------------ foo <- function(var) { data_filter(mtcars, var) } expect_identical( foo("mpg >= 30"), data_filter(mtcars, "mpg >= 30") ) foo2 <- function(data) { var2 <- "mpg >= 30" data_filter(data, var2) } expect_identical( foo2(mtcars), data_filter(mtcars, "mpg >= 30") ) foo3 <- function(data) { var <- "mpg >= 30" data_filter(data, var) } expect_identical( foo3(mtcars), data_filter(mtcars, "mpg >= 30") ) # Two args ----------- foo4 <- function(data, var3) { data_filter(data, var3) } expect_identical( foo4(mtcars, "mpg >= 30 & hp <= 66"), data_filter(mtcars, "mpg >= 30 & hp <= 66") ) }) test_that("programming with data_filter with variables", { var4 <- "mpg >= 30 & hp <= 66" expect_identical( data_filter(mtcars, var4), data_filter(mtcars, "mpg >= 30 & hp <= 66") ) var <- "mpg >= 30 & hp <= 66" expect_identical( data_filter(mtcars, var), data_filter(mtcars, "mpg >= 30 & hp <= 66") ) }) test_that("data_filter works with groups", { test <- data.frame( id = c(1, 1, 2, 2), x = c(0, 1, 3, 4), y = c(1, 2, 3, 4) ) test <- data_group(test, "id") expected <- data.frame(id = c(1, 2), x = c(0, 3), y = c(1, 3)) class(expected) <- c("grouped_df", "data.frame") attributes(expected)$groups <- attributes(test)$groups expect_equal( data_filter(test, x == min(x)), expected, ignore_attr = TRUE ) }) test_that("data_filter programming works with groups", { test <- data.frame( id = c(1, 1, 2, 2), x = c(0, 1, 3, 4), y = c(1, 2, 3, 4) ) test <- data_group(test, "id") expected <- data.frame(id = c(1, 2), x = c(0, 3), y = c(1, 3)) class(expected) <- c("grouped_df", "data.frame") attributes(expected)$groups <- attributes(test)$groups expect_equal( data_filter(test, "x == min(x)"), expected, ignore_attr = TRUE ) foo_gr1 <- function(data, var) { data_filter(data, var) } out <- foo_gr1(test, "x == min(x)") expect_equal(out, expected, ignore_attr = TRUE) }) test_that("data_filter with groups, different ways of dots", { grp <- data_group(mtcars, "cyl") fli <- "mpg <= 20" out1 <- data_filter(grp, mpg <= 20) out2 <- data_filter(grp, "mpg <= 20") out3 <- data_filter(grp, fli) expect_identical(out1, out2) expect_identical(out1, out3) }) test_that("data_filter, slicing works with functions", { d <- data.frame( a = c("aa", "a1", "bb", "b1", "cc", "c1"), b = 1:6, stringsAsFactors = FALSE ) rows <- grep("^[A-Za-z][0-9]$", x = d$a) out1 <- data_filter(d, rows) out2 <- data_filter(d, grep("^[A-Za-z][0-9]$", x = d$a)) expect_identical(out1, out2) out3 <- data_filter(iris, (Sepal.Width == 3.0) & (Species == "setosa")) expect_identical(nrow(out3), 6L) # styler: off expect_error( data_filter(iris, (Sepal.Width = 3.0) & (Species = "setosa")), # nolint regex = "Filtering did not work" ) # styler: on }) test_that("data_filter works with tibbles", { skip_if_not_installed("tibble") skip_if_not_installed("dplyr") data(mtcars) # preserve class d <- tibble::as_tibble(mtcars) out <- data_filter(d, mpg > 15) expect_s3_class(out, "tbl_df") # preserve attributes d <- tibble::as_tibble(mtcars) d <- dplyr::group_by(d, cyl) out <- data_filter(d, mpg > 15) expect_s3_class(out, "tbl_df") expect_named(attr(out, "groups"), c("cyl", ".rows")) }) ================================================ FILE: tests/testthat/test-data_merge.R ================================================ data(mtcars) x <- mtcars[3:5, 1:3] y <- mtcars[30:32, c(1, 4:5)] z <- mtcars[11:13, 6:8] x$id <- 1:3 y$id <- 2:4 z$id <- 3:5 # left ----------------------- test_that("left-join", { skip_if_not_installed("poorman") out <- data_merge(x, y, join = "left") expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat")) expect_identical(dim(out), c(3L, 6L)) expect_identical(out, suppressMessages(poorman::left_join(x, y))) out <- data_merge(x, y, join = "left", by = "id") expect_identical( colnames(out), c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y") ) expect_identical(out$disp, poorman::left_join(x, y, by = "id")$disp) expect_identical(dim(out), c(3L, 7L)) out <- data_merge(x, y, join = "left", by = "mpg") expect_identical( colnames(out), c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y") ) expect_identical(out$disp, poorman::left_join(x, y, by = "mpg")$disp) expect_identical(out$mpg, poorman::left_join(x, y, by = "mpg")$mpg) expect_identical(dim(out), c(3L, 7L)) }) # semi/anti ----------------------- # errors test_that("semi-anti-join", { expect_error(data_merge(x, y, join = "semi")) expect_error(data_merge(x, y, join = "anti")) }) # right ----------------------- test_that("right-join", { skip_if_not_installed("poorman") out <- data_merge(x, y, join = "right") expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat")) expect_identical(dim(out), c(3L, 6L)) # in data_merge(), we keep sorting from x, so do some preparation here poor_out <- suppressMessages(poorman::right_join(x, y)) poor_out <- poor_out[order(poor_out$id), ] row.names(poor_out) <- seq_len(nrow(poor_out)) expect_identical(out, poor_out) out <- data_merge(x, y, join = "right", by = "id") expect_identical( colnames(out), c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y") ) # in data_merge(), we keep sorting from x, so do some preparation here poor_out <- suppressMessages(poorman::right_join(x, y, by = "id")) poor_out <- poor_out[order(poor_out$id), ] expect_identical(out$disp, poor_out$disp) expect_identical(dim(out), c(3L, 7L)) out <- data_merge(x, y, join = "right", by = "mpg") expect_identical( colnames(out), c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y") ) # in data_merge(), we keep sorting from x, so do some preparation here poor_out <- suppressMessages(poorman::right_join(x, y, by = "mpg")) poor_out <- poor_out[order(poor_out$id.y, decreasing = TRUE), ] out <- out[order(out$id.y, decreasing = TRUE), ] expect_identical(out$disp, poor_out$disp) expect_identical(out$mpg, poor_out$mpg) expect_identical(dim(out), c(3L, 7L)) }) # inner ----------------------- test_that("inner-join", { skip_if_not_installed("poorman") out <- data_merge(x, y, join = "inner") expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat")) expect_identical(dim(out), c(0L, 6L)) out <- data_merge(x, y, join = "inner", by = "id") expect_identical( colnames(out), c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y") ) expect_identical(out$disp, poorman::inner_join(x, y, by = "id")$disp) expect_identical(dim(out), c(2L, 7L)) out <- data_merge(x, y, join = "inner", by = "mpg") expect_identical( colnames(out), c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y") ) expect_identical(out$disp, poorman::inner_join(x, y, by = "mpg")$disp) expect_identical(dim(out), c(1L, 7L)) }) # full ----------------------- test_that("full-join", { out <- data_merge(x, y, join = "full") expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat")) expect_identical(dim(out), c(6L, 6L)) expect_identical( out$mpg, c(22.8, 21.4, 18.7, 19.7, 15, 21.4), tolerance = 1e-2 ) expect_identical(out$id, c(1, 2, 3, 2, 3, 4), tolerance = 1e-2) out <- data_merge(x, y, join = "full", by = "id") expect_identical( colnames(out), c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y") ) expect_identical(dim(out), c(4L, 7L)) expect_identical(out$mpg.x, c(22.8, 21.4, 18.7, NA), tolerance = 1e-2) expect_identical(out$id, 1:4, tolerance = 1e-2) out <- data_merge(x, y, join = "full", by = "mpg") expect_identical( colnames(out), c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y") ) expect_identical(dim(out), c(5L, 7L)) expect_identical(out$mpg, c(22.8, 21.4, 18.7, 19.7, 15), tolerance = 1e-2) expect_identical(out$id.x, c(1, 2, 3, NA, NA), tolerance = 1e-2) out <- data_merge(x, y, join = "full", by = c("id", "mpg")) expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat")) expect_identical(dim(out), c(6L, 6L)) expect_identical( out$mpg, c(22.8, 21.4, 18.7, 19.7, 15, 21.4), tolerance = 1e-2 ) expect_identical(out$id, c(1, 2, 3, 2, 3, 4), tolerance = 1e-2) }) # bind ----------------------- test_that("bind-join", { skip_if_not_installed("poorman") out <- data_merge(x, y, join = "bind") poor_out <- poorman::bind_rows(x, y) row.names(poor_out) <- seq_len(nrow(poor_out)) expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat")) expect_identical(dim(out), c(6L, 6L)) expect_identical(out, poor_out) # by will be ignored out <- data_merge(x, y, join = "bind", by = "id") expect_identical(out, poor_out) # by will be ignored out <- data_merge(x, y, join = "bind", by = "mpg") expect_identical(out, poor_out) # by will be ignored out <- data_merge(x, y, join = "bind", by = c("id", "mpg")) expect_identical(out, poor_out) x <- mtcars[1, ] y <- mtcars[2, ] expect_warning( { out <- data_merge(x, y, join = "bind", id = "mpg") }, regexp = "already exists" ) expect_named( out, c(names(mtcars), "mpg_1") ) expect_identical(out$mpg_1, c(1, 2)) }) # joins without common columns ----------------------- test_that("bind-join", { skip_if_not_installed("poorman") x2 <- mtcars[3:5, 1:3] y2 <- mtcars[30:32, 4:6] expect_warning( data_merge(x2, y2, join = "full"), "Found no matching columns in the data frames." ) expect_identical( suppressWarnings(data_merge(x2, y2, join = "full")), suppressMessages(poorman::full_join(x2, y2)), ignore_attr = TRUE ) expect_identical( data_merge(x2, y2, join = "bind"), poorman::bind_rows(x2, y2), ignore_attr = TRUE ) }) # joins without common columns ----------------------- test_that("compare bind and full joins", { x2 <- mtcars[3:5, 1:3] y2 <- mtcars[30:32, 3:6] expect_identical( data_merge(x2, y2, join = "full"), data_merge(x2, y2, join = "bind"), ignore_attr = TRUE ) }) # join data frames in a list ----------------------- test_that("join data frames in a list", { skip_if_not_installed("poorman") x <- mtcars[1:5, 1:3] y <- mtcars[28:31, 3:5] z <- mtcars[11:18, c(1, 3:4, 6:8)] x$id <- 1:5 y$id <- 4:7 z$id <- 3:10 dat <- data_merge(list(x, y, z), by = "id", id = "df", join = "bind") expect_identical( remove_empty(subset(poorman::filter(dat, df == 1), select = -df)), x, ignore_attr = TRUE ) expect_identical( remove_empty(subset(poorman::filter(dat, df == 2), select = -c(df, id))), subset(y, select = -id), ignore_attr = TRUE ) expect_identical( remove_empty(subset(poorman::filter(dat, df == 3), select = -c(df, id))), subset(z, select = -id), ignore_attr = TRUE ) x <- mtcars[1, ] y <- mtcars[2, ] expect_warning( { out <- data_merge(list(x, y), join = "bind", id = "mpg") }, regexp = "already exists" ) expect_named( out, c(names(mtcars), "mpg_1") ) expect_identical(out$mpg_1, c(1, 2)) }) # join empty data frames ----------------------- x <- data.frame(x = character(), stringsAsFactors = FALSE) y <- data.frame(x = character(), stringsAsFactors = FALSE) z <- data.frame(y = character(), stringsAsFactors = FALSE) test_that("join empty data frames", { expect_identical(dim(data_merge(x, y, join = "left")), c(0L, 1L)) expect_identical(dim(data_merge(x, y, join = "full")), c(0L, 1L)) expect_identical(dim(data_merge(x, y, join = "right")), c(0L, 1L)) expect_identical(dim(data_merge(x, y, join = "bind")), c(0L, 1L)) expect_identical(dim(data_merge(x, z, join = "bind")), c(0L, 2L)) }) # join when all "by" are not present --------------------- test_that("join when all 'by' are not present", { x <- mtcars[, c("mpg", "drat", "cyl", "qsec")] y <- mtcars[, c("mpg", "hp", "cyl", "wt")] expect_error( { out <- data_merge(x, y, by = c("mpg", "drat", "qsec")) }, regexp = "Not all columns" ) }) # no warning for tibble #404 --------------------- test_that("no warning for tibble when checking if column exist", { skip_if_not_installed("tibble") d_tibble <- tibble::as_tibble(iris) expect_silent(data_merge(d_tibble, d_tibble[20:30, ], join = "bind")) }) ================================================ FILE: tests/testthat/test-data_modify.R ================================================ ## styler: off test_that("data_modify works", { data(iris) out <- data_modify( iris, Sepal_W_z = standardize(Sepal.Width), Sepal_Wz_double = 2 * Sepal_W_z ) expect_equal( out$Sepal_W_z, as.vector(scale(iris$Sepal.Width)), ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( out$Sepal_Wz_double, 2 * as.vector(scale(iris$Sepal.Width)), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_modify works with strings", { data(iris) out <- data_modify( iris, as_expr("Sepal_W_z = standardize(Sepal.Width)") ) expect_equal( out$Sepal_W_z, as.vector(scale(iris$Sepal.Width)), ignore_attr = TRUE, tolerance = 1e-3 ) out <- data_modify( iris, as_expr(c( "Sepal_W_z = standardize(Sepal.Width)", "Sepal_Wz_double = 2 * Sepal_W_z" )) ) expect_equal( out$Sepal_Wz_double, 2 * as.vector(scale(iris$Sepal.Width)), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_modify preserves labels", { data(efc) out <- data_modify( efc, c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE) ) expect_identical( attributes(out$c12hour_c)$label, attributes(efc$c12hour)$label ) expect_identical( attributes(out$c12hour_z)$label, attributes(efc$c12hour)$label ) out <- data_modify( efc, as_expr(c( "c12hour_c = center(c12hour)", "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)" )) ) expect_identical( attributes(out$c12hour_c)$label, attributes(efc$c12hour)$label ) expect_identical( attributes(out$c12hour_z)$label, attributes(efc$c12hour)$label ) }) test_that("data_modify recycling works", { data(iris) out <- data_modify(iris, x = 1) expect_equal(out$x, rep(1, nrow(iris)), ignore_attr = TRUE) out <- data_modify(iris, x = c(1, 2)) expect_equal(out$x, rep(c(1, 2), nrow(iris) / 2), ignore_attr = TRUE) expect_error(data_modify(iris, x = 1:4), regex = "same length") out <- data_modify(iris, x = "a") expect_equal(out$x, rep("a", nrow(iris)), ignore_attr = TRUE) }) test_that("data_modify recycling works with grouped df", { data(iris) d <- data_group(iris, "Species") expect_silent(data_modify(d, x = 1, test = 1:2)) }) test_that("data_modify expression in character vector-1", { data(iris) x <- "var_a = Sepal.Width" out <- data_modify(iris, as_expr(x)) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "var_a" ) ) }) test_that("data_modify expression in character vector-2", { data(iris) foo <- function(data) { y <- "var_a = Sepal.Width" head(data_modify(data, as_expr(y))) } out <- foo(iris) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "var_a" ) ) expect_identical(out$var_a, out$Sepal.Width) foo2 <- function(data, z) { head(data_modify(data, as_expr(z))) } out <- foo2(iris, "var_a = Sepal.Width") expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "var_a" ) ) expect_identical(out$var_a, out$Sepal.Width) }) test_that("data_modify expression in character vector-3", { data(iris) aa <- "2 * Sepal.Width" out <- data_modify(iris, new_var = as_expr(aa)) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "new_var" ) ) expect_identical(out$new_var, 2 * out$Sepal.Width) aa <- "2 * Sepal.Width" out <- data_modify(iris, new_var = as_expr(aa)) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "new_var" ) ) expect_identical(out$new_var, 2 * out$Sepal.Width) foo_nv <- function(data, z) { head(data_modify(data, new_var = as_expr(z))) } out <- foo_nv(iris, "2 * Sepal.Width") expect_identical( colnames(out), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "new_var" ) ) expect_identical(out$new_var, 2 * out$Sepal.Width) }) test_that("data_modify expression as character vector-4", { data(iris) x <- "var_a = Sepal.Width" y <- "Sepal_Wz_double = 2 * var_a" out <- data_modify(iris, as_expr(c(x, y))) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "var_a", "Sepal_Wz_double" ) ) expect_identical(out$var_a, out$Sepal.Width) expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width) foo1 <- function(data) { x1 <- "var_a = Sepal.Width" y1 <- "Sepal_Wz_double = 2 * var_a" combined <- c(x1, y1) data_modify(iris, as_expr(combined)) } out <- foo1(iris) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "var_a", "Sepal_Wz_double" ) ) expect_identical(out$var_a, out$Sepal.Width) expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width) foo2 <- function(data, z3) { data_modify(data, as_expr(z3)) } out <- foo2(iris, c("var_a = Sepal.Width", "Sepal_Wz_double = 2 * var_a")) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "var_a", "Sepal_Wz_double" ) ) expect_identical(out$var_a, out$Sepal.Width) expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width) # works with separated strings data(iris) out <- data_modify( iris, as_expr("var_a = Sepal.Width"), as_expr("Sepal_Wz_double = 2 * var_a") ) expect_identical(out$var_a, out$Sepal.Width) expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width) out <- data_modify( iris, as_expr(c("var_a = Sepal.Width", "Sepal_Wz_double = 2 * var_a")) ) expect_identical(out$var_a, out$Sepal.Width) expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width) }) test_that("data_modify works with function as expression", { data(iris) out <- data_modify(iris, foo = grepl("a", Species)) # nolint expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, foo = as_expr("grepl(\"a\", Species)")) expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, as_expr("foo = grepl(\"a\", Species)")) expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, foo = as_expr("grepl('a', Species)")) expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, as_expr("foo = grepl('a', Species)")) expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, foo = as_expr('grepl(\'a\', Species)')) # nolint expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, as_expr('foo = grepl(\'a\', Species)')) # nolint expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, foo = as_expr('grepl(\"a\", Species)')) expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) out <- data_modify(iris, as_expr('foo = grepl(\"a\", Species)')) expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50)) }) test_that("data_modify remove variables with NULL", { data(iris) out <- data_modify(iris, PL_new = 2 * Petal.Length, Petal.Length = NULL) expect_named( out, c("Sepal.Length", "Sepal.Width", "Petal.Width", "Species", "PL_new") ) expect_identical(out$PL_new, 2 * iris$Petal.Length) out <- data_modify(iris, as_expr("Species = NULL")) expect_named( out, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) }) test_that("data_modify works on grouped data", { data(efc) grouped_efc <- data_group(efc, "c172code") out <- data_modify( grouped_efc, c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), c12hour_z2 = standardize(c12hour) ) out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector) expect_equal( na.omit(out$c12hour_z2[out$c172code == 1]), out2[[1]], ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_modify works on grouped data, with character vectors", { data(efc) grouped_efc <- data_group(efc, "c172code") out <- data_modify( grouped_efc, as_expr(c( "c12hour_c = center(c12hour)", "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)", "c12hour_z2 = standardize(c12hour)" )) ) out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector) expect_equal( na.omit(out$c12hour_z2[out$c172code == 1]), out2[[1]], ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( na.omit(out$c12hour_z[out$c172code == 1]), out2[[1]], ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_modify works on grouped data, preserves attributes and labels", { data(efc) grouped_efc <- data_group(efc, "c172code") out <- data_modify( grouped_efc, c12hour_c = center(c12hour) ) expect_identical( attributes(out$c12hour)$label, attributes(efc$c12hour)$label ) }) test_that("data_modify works on grouped data, inside functions", { data(efc) foo4 <- function(data) { data_modify( data, as_expr(c( "c12hour_c = center(c12hour)", "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)", "c12hour_z2 = standardize(c12hour)" )) ) } out <- foo4(data_group(efc, "c172code")) out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector) expect_equal( na.omit(out$c12hour_z2[out$c172code == 1]), out2[[1]], ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( na.omit(out$c12hour_z[out$c172code == 1]), out2[[1]], ignore_attr = TRUE, tolerance = 1e-3 ) foo5 <- function(data, rec) { data_modify(data, as_expr(rec)) } out <- foo5( data_group(efc, "c172code"), c( "c12hour_c = center(c12hour)", "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)", "c12hour_z2 = standardize(c12hour)" ) ) out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector) expect_equal( na.omit(out$c12hour_z2[out$c172code == 1]), out2[[1]], ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( na.omit(out$c12hour_z[out$c172code == 1]), out2[[1]], ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_modify errors for non df", { expect_error(data_modify( iris$Sepal.Length, Sepal_W_z = standardize(Sepal.Width) )) }) test_that("data_modify errors for empty data frames", { data(mtcars) x <- mtcars[1, ] expect_error( data_modify(x[-1, ], new_var = 5), regex = "empty data frame" ) }) test_that("data_modify errors for typos", { data(efc) a <- "center(c22hour)" # <---------------- error in variable name b <- "c12hour_c / sd(c12hour, na.rm = TRUE)" expect_error( data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)), regex = "c22hour" ) a <- "center(c12hour)" b <- "c12hour_c / sd(c21hour, na.rm = TRUE)" # <------ error in variable name expect_error( data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)), regex = "c12hour_c" ) expect_error( data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)), regex = "second expression" ) }) test_that("data_modify message about recycling values", { expect_snapshot(head(data_modify(iris, Sepal.Width = 1))) expect_snapshot(head(data_modify(iris, Sepal.Width = 1:2))) expect_snapshot(head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1))) expect_snapshot(head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1:2))) expect_snapshot(head(data_modify(iris, Petal.Length = 2, Sepal.Width = 2))) }) test_that("data_modify message about modified variables", { expect_snapshot(head(data_modify(iris, Sepal.Width = 2 * Sepal.Width))) expect_snapshot(head(data_modify( iris, Petal.Length = Sepal.Length, Sepal.Width = Petal.Width ))) }) test_that("data_modify works with character variables, and inside functions", { data(efc) a <- "center(c12hour)" b <- "c12hour_c / sd(c12hour, na.rm = TRUE)" d <- "standardize(c12hour)" out <- data_modify( efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b), c12hour_z2 = as_expr(d) ) expect_equal( out$c12hour_z2, as.vector(scale(efc$c12hour)), ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( out$c12hour_z, as.vector(scale(efc$c12hour)), ignore_attr = TRUE, tolerance = 1e-3 ) # when calling functions a1 <- "center(c12hour)" b1 <- "c12hour_c / sd(c12hour, na.rm = TRUE)" d1 <- "standardize(c12hour)" foo <- function(data, x1, x2, x3) { data_modify( efc, c12hour_c = as_expr(x1), c12hour_z = as_expr(x2), c12hour_z2 = as_expr(x3) ) } out <- foo(efc, a1, b1, d1) expect_equal( out$c12hour_z2, as.vector(scale(efc$c12hour)), ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( out$c12hour_z, as.vector(scale(efc$c12hour)), ignore_attr = TRUE, tolerance = 1e-3 ) # when calling functions, arguments inside function defined foo2 <- function(data) { a2 <- "center(c12hour)" b2 <- "c12hour_c / sd(c12hour, na.rm = TRUE)" d2 <- "standardize(c12hour)" data_modify( efc, c12hour_c = as_expr(a2), c12hour_z = as_expr(b2), c12hour_z2 = as_expr(d2) ) } out <- foo2(efc) expect_equal( out$c12hour_z2, as.vector(scale(efc$c12hour)), ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( out$c12hour_z, as.vector(scale(efc$c12hour)), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_modify works with grouped df when overwriting existing variables", { data(iris) iris_grp <- data_group(iris, "Species") out <- data_modify(iris_grp, Sepal.Length = normalize(Sepal.Length)) expect_equal( head(out$Sepal.Length), c(0.53333, 0.4, 0.26667, 0.2, 0.46667, 0.73333), tolerance = 1e-3 ) out <- data_modify( iris_grp, Sepal.Length = normalize(Sepal.Length), Sepal.Length2 = 2 * Sepal.Length ) expect_equal( head(out$Sepal.Length2), 2 * c(0.53333, 0.4, 0.26667, 0.2, 0.46667, 0.73333), tolerance = 1e-3 ) }) test_that("data_modify works with functions that return character vectors", { data(iris) set.seed(123) out <- data_modify(iris, grp = sample(letters[1:3], nrow(iris), TRUE)) expect_identical(head(out$grp), c("a", "c", "b", "a", "c", "c")) }) test_that("data_modify 1:n() and similar works in (grouped) data frames", { data(mtcars) out <- data_modify(mtcars, Trials = 1:n()) # nolint expect_identical(out$Trials, 1:32) x <- data_group(mtcars, "gear") out <- data_modify(x, Trials = 1:n()) # nolint expect_identical(out$Trials[out$gear == 3], 1:15) expect_identical(out$Trials[out$gear == 4], 1:12) out <- data_modify(x, Trials = 3:(n() + 2)) expect_identical(out$Trials[out$gear == 3], 3:17) expect_identical(out$Trials[out$gear == 4], 3:14) }) test_that("data_modify .if/.at arguments", { data(iris) d <- iris[1:5, ] # validate results out <- data_modify(d, .at = "Species", .modify = as.numeric) expect_identical(out$Species, c(1, 1, 1, 1, 1)) out <- data_modify(d, .if = is.factor, .modify = as.numeric) expect_identical(out$Species, c(1, 1, 1, 1, 1)) out <- data_modify( d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric ) expect_identical(out$Species, c(1, 1, 1, 1, 1)) expect_named( out, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "new_length" ) ) # using other functions with `.at` out <- data_modify( d, .at = extract_column_names(d, select = starts_with("Sepal")), .modify = as.factor ) expect_s3_class(out$Sepal.Length, "factor") expect_s3_class(out$Sepal.Width, "factor") # .at and .if cannot be used at same timne expect_error( data_modify(d, .at = "Species", .if = is.factor, .modify = as.numeric), regex = "You cannot use both" ) # modify must be a function expect_error( data_modify(d, .at = "Species", .modify = "a"), regex = "`.modify` must" ) # unknown variable expect_error( data_modify(d, .at = c("Species", "Test"), .modify = as.numeric), regex = "Variable \"Test\"" ) # unknown variables expect_error( data_modify(d, .at = c("Species", "Hi", "Test"), .modify = as.numeric), regex = "Variables \"Hi\" and \"Test\"" ) # one of .at or .if must be specified expect_error( data_modify(d, .modify = as.numeric), regex = "You need to specify" ) # function not applicable to factors expect_error( data_modify(d, .at = "Species", .modify = function(x) 2 / y + x), regex = "Error in modifying variable" ) # function not applicable to factors expect_error( data_modify(d, .at = "Species", .modify = function(x) 2 * x), regex = "Error in modifying variable" ) # .modify needs to be specified expect_error( data_modify(d, .at = "Species", .if = is.factor), regex = "You need to specify" ) # newly created variables are processed by if/at out <- data_modify( d, new_length = Petal.Length * 2, .if = is.numeric, .modify = round ) expect_equal(out$new_length, c(3, 3, 3, 3, 3), ignore_attr = TRUE) }) test_that("data_modify works with new expressions, different use cases same results", { data(iris) out1 <- data_modify(iris, as_expr("sepwid = 2 * Sepal.Width")) out2 <- data_modify(iris, sepwid = as_expr("2 * Sepal.Width")) e <- "sepwid = 2 * Sepal.Width" out3 <- data_modify(iris, as_expr(e)) e <- "2 * Sepal.Width" out4 <- data_modify(iris, sepwid = as_expr(e)) expect_equal(head(out1), head(out2), ignore_attr = TRUE, tolerance = 1e-4) expect_equal(head(out1), head(out3), ignore_attr = TRUE, tolerance = 1e-4) expect_equal(head(out1), head(out4), ignore_attr = TRUE, tolerance = 1e-4) out1b <- data_modify( iris, as_expr(c("sepwid = 2 * Sepal.Width", "seplen = 5 * Sepal.Length")) ) out2b <- data_modify( iris, sepwid = as_expr("2 * Sepal.Width"), seplen = as_expr("5 * Sepal.Length") ) e <- c("sepwid = 2 * Sepal.Width", "seplen = 5 * Sepal.Length") out3b <- data_modify(iris, as_expr(e)) e <- "2 * Sepal.Width" out4b <- data_modify(iris, sepwid = as_expr(e), seplen = 5 * Sepal.Length) expect_equal(head(out1b), head(out2b), ignore_attr = TRUE, tolerance = 1e-4) expect_equal(head(out1b), head(out3b), ignore_attr = TRUE, tolerance = 1e-4) expect_equal(head(out1b), head(out4b), ignore_attr = TRUE, tolerance = 1e-4) # no expression out <- data_modify(iris, sepwid = "2 * Sepal.Widht") expect_identical( head(out$sepwid), c( "2 * Sepal.Widht", "2 * Sepal.Widht", "2 * Sepal.Widht", "2 * Sepal.Widht", "2 * Sepal.Widht", "2 * Sepal.Widht" ) ) # works with paste() to_standardize <- c("Petal.Length", "Sepal.Length") out <- data_modify( iris, as_expr( paste0(to_standardize, "_stand = standardize(", to_standardize, ")") ) ) expect_equal( head(out$Petal.Length_stand), c(-1.33575, -1.33575, -1.3924, -1.2791, -1.33575, -1.16581), tolerance = 1e-3 ) expect_equal( head(out$Sepal.Length_stand), c(-0.89767, -1.1392, -1.38073, -1.50149, -1.01844, -0.53538), tolerance = 1e-3 ) # complex example e <- "2 * Sepal.Width" f <- "half_petal = 0.5 * Petal.Length" a <- "string" num <- 1:5 out_complex <- data_modify( iris, sepwid = as_expr(e), seplen = 5 * Sepal.Length, as_expr(f), new_var = a, new_num = num, new_var2 = "ho", new_num2 = 4:6, Sepal.Length = NULL, Petal.Length = NULL, Sepal.Width = NULL, Petal.Width = NULL ) expect_snapshot(print(head(out_complex))) }) test_that("data_modify works with new expressions, grouped_df, different use cases same results", { data(efc, package = "datawizard") grouped_efc <- data_group(efc, "c172code") new_efc1 <- data_modify( grouped_efc, c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), c12hour_z2 = standardize(c12hour), id = 1:n() # nolint ) new_efc2 <- data_modify( grouped_efc, as_expr("c12hour_c = center(c12hour)"), c12hour_z = as_expr("c12hour_c / sd(c12hour, na.rm = TRUE)"), c12hour_z2 = standardize(c12hour), id = 1:n() # nolint ) expect_equal( head(new_efc1), head(new_efc2), ignore_attr = TRUE, tolerance = 1e-4 ) s <- c( "c12hour_c = center(c12hour)", "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)", "c12hour_z2 = standardize(c12hour)" ) new_efc3 <- data_modify( grouped_efc, as_expr(s), id = 1:n() # nolint ) expect_equal( head(new_efc1), head(new_efc3), ignore_attr = TRUE, tolerance = 1e-4 ) new_efc4 <- data_modify( grouped_efc, c12hour_c = center(c12hour), c12hour_z = as_expr("c12hour_c / sd(c12hour, na.rm = TRUE)"), c12hour_z2 = standardize(c12hour), id = 1:n() # nolint ) expect_equal( head(new_efc1), head(new_efc4), ignore_attr = TRUE, tolerance = 1e-4 ) }) test_that("data_modify errors with new expressions", { e <- "sepwid = 2 * Sepal.Widht" expect_error( data_modify(iris, as_expr(e)), regex = "in the first expression" ) expect_error( data_modify(iris, as_expr(e)), regex = "Sepal.Widht" ) expect_error( data_modify(iris, as_expr("sepwid = 2 * Sepal.Widht")), regex = "in the first expression" ) expect_error( data_modify(iris, as_expr("sepwid = 2 * Sepal.Widht")), regex = "Sepal.Widht" ) expect_error( data_modify(iris, sepwid = 2 * Sepal.Widht), regex = "in the first expression" ) expect_error( data_modify(iris, sepwid = 2 * Sepal.Widht), regex = "Sepal.Widht" ) expect_error( data_modify(iris, as_expr("2 * Sepal.Widht")), regex = "variable name" ) e <- "2 * Sepal.Widht" expect_error( data_modify(iris, as_expr(e)), regex = "variable name" ) data(efc, package = "datawizard") a <- "center(c22hour)" # <---------------- error in variable name b <- "c12hour_c / sd(c12hour, na.rm = TRUE)" expect_error( data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)), regex = "c22hour" ) expect_error( data_modify(iris, a = as_expr(c("1 + 1", "2 + 2"))), regex = "Could not evaluate expression" ) }) skip_if_not_installed("withr") withr::with_environment( new.env(), test_that("data_modify 1:n() and similar works in (grouped) data frames inside function calls", { data(mtcars) x <- data_group(mtcars, "gear") foo <- function(d) { out <- data_modify(d, Trials = 1:n()) # nolint out$Trials } expect_identical( foo(x), c( 1L, 2L, 3L, 1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 5L, 6L, 7L, 8L, 9L, 10L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 11L, 1L, 2L, 3L, 4L, 5L, 12L ) ) }) ) test_that("data_modify errors on non-defined function", { expect_error(data_modify(iris, Species = foo())) }) withr::with_environment( new.env(), test_that("data_modify correctly assigns values from variables", { d <- data.frame() for (param in letters[c(1, 2, 5)]) { out <- data.frame(x = as.numeric(as.factor(param))) out <- data_modify(out, Parameter = param) d <- rbind(out, d) } expect_named(d, c("x", "Parameter")) expect_identical(d$Parameter, c("e", "b", "a")) d <- data.frame() for (param in c("a 1", "b 2")) { out <- data.frame(x = as.numeric(as.factor(param))) out <- data_modify(out, Parameter = param) d <- rbind(out, d) } expect_named(d, c("x", "Parameter")) expect_identical(d$Parameter, c("b 2", "a 1")) # variable is not copied, values is used a <- "x" d <- data.frame(x = 1) out <- data_modify(d, y = a) expect_identical(out$y, "x") }) ) withr::with_environment( new.env(), test_that("data_modify passes expression syntax to function", { foo1 <- function(data, ...) { head(data_modify(data, ...)) } out1 <- foo1(iris, SW_fraction = Sepal.Width / 10) out2 <- foo1(iris, as_expr("SW_fraction = Sepal.Width / 10")) expect_identical(out1, out2) }) ) ## styler: on ================================================ FILE: tests/testthat/test-data_partition.R ================================================ test_that("data_partition works as expected", { # not supported expect_error( data_partition(new.env()), "`data` must be a data frame" ) # to be coerced to data frames expect_snapshot(data_partition(letters, seed = 123)) # validation checks expect_warning( data_partition(iris, 0.7, row_id = "Species"), "exists" ) expect_warning(expect_warning( data_partition(iris, c(0.7, 0.3), row_id = "Species"), "generated" )) # values out <- data_partition(mtcars, proportion = 0.8, seed = 123) expect_identical( out$p_0.8$.row_id, c( 1L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 14L, 15L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L ) ) expect_identical( colnames(out$p_0.8), c( "mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb", ".row_id" ) ) expect_identical( lapply(out, nrow), list(p_0.8 = 26L, test = 6L) ) # data frames data(iris) expect_snapshot(str(data_partition(iris, proportion = 0.7, seed = 123))) expect_snapshot(str(data_partition( iris, proportion = c(0.2, 0.5), seed = 123 ))) expect_snapshot(str(data_partition( iris, proportion = 0.7, by = "Species", seed = 123 ))) expect_snapshot(str(data_partition( iris, proportion = c(0.2, 0.5), by = "Species", seed = 123 ))) }) test_that("data_partition warns if no testing set", { expect_warning( data_partition(iris, proportion = 1), "sums up to 1" ) expect_warning( data_partition(iris, proportion = c(0.5, 0.5)), "sums up to 1" ) }) test_that("data_partition errors if values in proportion not between 0 and 1", { expect_error( data_partition(iris, proportion = 1.3), "cannot be higher" ) expect_error( data_partition(iris, proportion = c(0.5, 0.6)), "cannot be higher" ) expect_error( data_partition(iris, proportion = c(1.3, -1)), "cannot be negative" ) expect_error( data_partition(iris, proportion = -1), "cannot be negative" ) }) test_that("data_partition warns if row_id already exists", { iris2 <- iris iris2[[".row_id"]] <- "A" expect_warning( data_partition(iris2, proportion = 0.5), "already exists" ) iris2[["foo"]] <- "A" expect_warning( data_partition(iris2, proportion = 0.5, row_id = "foo"), "already exists" ) part1 <- data_partition(iris, proportion = 0.5, seed = 123) part2 <- suppressWarnings(data_partition(iris2, proportion = 0.5, seed = 123)) expect_identical( part1$p_0.5[1:5], part2$p_0.5[1:5] ) }) ================================================ FILE: tests/testthat/test-data_peek.R ================================================ test_that("data_peek works as expected", { out <- data_peek(iris) expect_named(out, c("Variable", "Type", "Values")) expect_identical( out$Variable, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_identical(dim(out), c(5L, 3L)) }) test_that("data_peek works as expected with select", { out <- data_peek(iris, select = 2:4) expect_named(out, c("Variable", "Type", "Values")) expect_identical( out$Variable, c("Sepal.Width", "Petal.Length", "Petal.Width") ) expect_identical(dim(out), c(3L, 3L)) }) test_that("data_peek works as expetced with custom width", { out <- data_peek(iris, width = 130) expect_named(out, c("Variable", "Type", "Values")) expect_identical( out$Variable, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_identical(dim(out), c(5L, 3L)) }) test_that("data_peek snapshots look as expected", { expect_snapshot(data_peek(iris)) expect_snapshot(data_peek(iris, select = 1:3)) expect_snapshot(data_peek(iris, width = 130)) }) ================================================ FILE: tests/testthat/test-data_read.R ================================================ skip_if_not_installed("httr") skip_if_not_installed("readxl") skip_if_not_installed("haven") skip_if_not_installed("readr") skip_if_not_installed("data.table") skip_if_not_installed("rio") skip_on_cran() skip_if_not_installed("curl") skip_if_offline() # csv ------------------------- test_that("data_read - csv", { d <- data_read( "https://raw.githubusercontent.com/easystats/circus/main/data/bootstrapped.csv", verbose = FALSE ) expect_identical(dim(d), c(10000L, 4L)) }) # csv ------------------------- test_that("data_read, skip_empty", { d <- data_read( "https://raw.githubusercontent.com/easystats/circus/main/data/test_skip_empty.csv", verbose = FALSE ) expect_identical(ncol(d), 3L) expect_identical(colnames(d), c("Var1", "Var2", "Var3")) }) # tsv ------------------------- test_that("data_read - tsv", { skip_if_not_installed("withr") withr::with_tempfile("temp_file", fileext = ".tsv", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/sample1.tsv" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) expect_identical(nrow(d), 3L) expect_identical(colnames(d), c("a", "b", "c")) expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 2L) expect_identical(sum(vapply(d, is.character, FUN.VALUE = logical(1L))), 1L) }) }) # excel ------------------------- test_that("data_read - excel", { skip_if_not_installed("withr") withr::with_tempfile("temp_file", fileext = ".xlsx", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/sample1.xlsx" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) expect_identical(nrow(d), 3L) expect_identical(colnames(d), c("a", "b", "c")) expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 2L) expect_identical(sum(vapply(d, is.character, FUN.VALUE = logical(1L))), 1L) }) }) # Stata file ----------------------------------- test_that("data_read - Stata file", { withr::with_tempfile("temp_file", fileext = ".dta", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/stata_test.dta" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) expect_identical( d, data.frame( mpg = c(21, 21, 22.8), cyl = c(6, 6, 4), disp = c(160, 160, 108) ) ) }) }) # SAS file ----------------------------------- test_that("data_read - SAS file", { withr::with_tempfile("temp_file", fileext = ".sas7bdat", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/sas_test.sas7bdat" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) expect_identical( d, data.frame( mpg = c(21, 21, 22.8), cyl = c(6, 6, 4), disp = c(160, 160, 108) ) ) }) }) # RDS file, matrix, coercible ----------------------------------- test_that("data_read - RDS file, matrix, coercible", { withr::with_tempfile("temp_file", fileext = ".rds", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/matrix_object.rds" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) expect_message({ d <- data_read( temp_file, verbose = TRUE ) }) expect_s3_class(d, "data.frame") expect_identical(dim(d), c(2L, 5L)) }) }) # RDS file, preserve class /types ----------------------------------- test_that("data_read - RDS file, preserve class", { withr::with_tempfile("temp_file", fileext = ".rds", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/hiv.rds" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read(temp_file, verbose = FALSE) expect_s3_class(d, "data.frame") expect_identical( sapply(d, class), c( village = "integer", outcome = "integer", distance = "numeric", amount = "numeric", incentive = "integer", age = "integer", hiv2004 = "integer", agecat = "factor" ) ) }) }) # RData ----------------------------------- test_that("data_read - no warning for RData", { withr::with_tempfile("temp_file", fileext = ".RData", code = { data(mtcars) save(mtcars, file = temp_file) expect_silent(data_read(temp_file, verbose = FALSE)) }) }) test_that("data_read - message for multiple objects in RData", { withr::with_tempfile("temp_file", fileext = ".RData", code = { data(mtcars) data(iris) save(mtcars, iris, file = temp_file) expect_message( expect_message( data_read(temp_file, verbose = TRUE), regex = "File contained more than one object" ), "Reading data" ) }) }) # SPSS file ----------------------------------- test_that("data_read - SPSS file", { withr::with_tempfile("temp_file", fileext = ".sav", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/EFC.sav" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) expect_identical(sum(vapply(d, is.factor, FUN.VALUE = logical(1L))), 15L) expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 11L) expect_identical( levels(d$c172code), c( "low level of education", "intermediate level of education", "high level of education" ) ) expect_identical( attr(d$n4pstu, "labels"), c( `spouse/partner` = 1, child = 2, sibling = 3, `daughter or son -in-law` = 4 ) ) }) }) # SPSS file 2 --------------------------------- test_that("data_read - SPSS file 2", { withr::with_tempfile("temp_file", fileext = ".sav", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/spss_test.sav" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) expect_identical( d, structure( list( V1 = structure( 1:4, levels = c( "Eins", "Zwei", "Drei", "Vier" ), class = "factor", converted_to_factor = TRUE, label = "Variable 1" ), V2 = structure( c(2, 3, 4, 1), labels = c( Eins = 1, Zwei = 2, Drei = 3 ), label = "Variable 2" ), V3 = structure( c( 3L, 2L, 1L, 4L ), levels = c("Eins", "Zwei", "Drei", "Vier"), class = "factor", converted_to_factor = TRUE, label = "Variable 3" ) ), row.names = c(NA, -4L), class = "data.frame" ) ) }) }) # zipped SPSS file ----------------------------------- test_that("data_read - zipped SPSS file", { withr::with_tempfile("temp_file", fileext = ".zip", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/EFC.zip" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) expect_identical(sum(vapply(d, is.factor, FUN.VALUE = logical(1L))), 15L) expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 11L) d <- data_read( temp_file, convert_factors = FALSE, verbose = FALSE ) expect_identical(sum(vapply(d, is.factor, FUN.VALUE = logical(1L))), 0L) expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 26L) }) }) # SPSS file, many value labels ----------------------------------- test_that("data_read, convert many labels correctly", { # Output validated against SPSS output from original dataset withr::with_tempfile("temp_file", fileext = ".sav", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/spss_many_labels.sav" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, verbose = FALSE ) # all are factors by default expect_identical( vapply(d, class, character(1)), c(selv1 = "factor", c12 = "factor", c12a = "factor", c12c = "factor") ) expect_identical( levels(d$selv1), c( "Vignette 1 weiblich (Gülsen E. Reinigungskraft B)", "Vignette 2 weiblich (Gülsen E. Anwältin B)", "Vignette 3 weiblich (Monika E. Reinigungskraft B)", "Vignette 4 weiblich (Monika E. Anwältin B)", "Vignette 5 männlich (Hasan E. Reinigungskraft B)", "Vignette 6 männlich (Hasan E. Anwalt B)", "Vignette 7 männlich (Martin E. Reinigungskraft B)", "Vignette 8 männlich (Martin E. Anwalt B)", "Vignette 9 weiblich (Gülsen E. Reinigungskraft E)", "Vignette 10 weiblich (Gülsen E. Anwältin E)", "Vignette 11 weiblich (Monika E. Reinigungskraft E)", "Vignette 12 weiblich (Monika E. Anwältin E)", "Vignette 13 männlich (Hasan E. Reinigungskraft E)", "Vignette 14 männlich (Hasan E. Anwalt E)", "Vignette 15 männlich (Martin E. Reinigungskraft E)", "Vignette 16 männlich (Martin E. Anwalt E)" ) ) expect_snapshot(data_tabulate(d$selv1)) expect_identical(levels(d$c12), c("ja", "nein", "keine Angabe")) expect_snapshot(data_tabulate(d$c12)) expect_identical(levels(d$c12a), c("Filter", "ja", "nein", "keine Angabe")) expect_snapshot(data_tabulate(d$c12a)) expect_identical( levels(d$c12c), c( "Filter", "0 = keine", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10 = sehr starke", "weiß nicht / keine Angabe" ) ) expect_snapshot(data_tabulate(d$c12c)) expect_message( expect_message( expect_message( data_read(temp_file), regexp = "Reading" ), regexp = "Variables where all" ), regexp = "4 out of 4" ) d <- data_read( temp_file, convert_factors = FALSE, verbose = FALSE ) # all are factors by default expect_identical( vapply(d, class, character(1)), c(selv1 = "numeric", c12 = "numeric", c12a = "numeric", c12c = "numeric") ) expect_snapshot(table(d$selv1)) expect_identical( attributes(d$selv1)$labels, c( `Vignette 1 weiblich (Gülsen E. Reinigungskraft B)` = 1, `Vignette 2 weiblich (Gülsen E. Anwältin B)` = 2, `Vignette 3 weiblich (Monika E. Reinigungskraft B)` = 3, `Vignette 4 weiblich (Monika E. Anwältin B)` = 4, `Vignette 5 männlich (Hasan E. Reinigungskraft B)` = 5, `Vignette 6 männlich (Hasan E. Anwalt B)` = 6, `Vignette 7 männlich (Martin E. Reinigungskraft B)` = 7, `Vignette 8 männlich (Martin E. Anwalt B)` = 8, `Vignette 9 weiblich (Gülsen E. Reinigungskraft E)` = 9, `Vignette 10 weiblich (Gülsen E. Anwältin E)` = 10, `Vignette 11 weiblich (Monika E. Reinigungskraft E)` = 11, `Vignette 12 weiblich (Monika E. Anwältin E)` = 12, `Vignette 13 männlich (Hasan E. Reinigungskraft E)` = 13, `Vignette 14 männlich (Hasan E. Anwalt E)` = 14, `Vignette 15 männlich (Martin E. Reinigungskraft E)` = 15, `Vignette 16 männlich (Martin E. Anwalt E)` = 16, `99` = 99 ) ) expect_snapshot(table(d$c12)) expect_identical( attributes(d$c12)$labels, c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99) ) expect_snapshot(table(d$c12a)) expect_identical( attributes(d$c12a)$labels, c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99) ) expect_snapshot(table(d$c12c)) expect_identical( attributes(d$c12c)$labels, c( Filter = -2, `0 = keine` = 0, `1` = 1, `2` = 2, `3` = 3, `4` = 4, `5` = 5, `6` = 6, `7` = 7, `8` = 8, `9` = 9, `10 = sehr starke` = 10, `weiß nicht / keine Angabe` = 99 ) ) }) }) # invalid file type ------------------------- test_that("data_read, no file extension", { expect_error(data_read("mytestfile"), regex = "extension") expect_error(data_read(NULL, regex = "extension")) }) # file not exists ------------------------- test_that("data_read, file not exists", { expect_error(data_read("thisfileshouldnotexist.csv"), regex = "not exist") expect_error( suppressMessages(data_read("thisfileshouldnotexist.sav")), regex = "not exist" ) }) # RDS file, no data frame ----------------------------------- test_that("data_read - RDS file, no data frame", { skip_if_not_installed("withr") withr::with_tempfile("temp_file", fileext = ".rda", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/list_for_testing.rda" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) expect_message( expect_warning( d <- data_read(temp_file, verbose = TRUE), # nolint regex = "no data frame" ), "Reading data" ) expect_type(d, "list") }) }) test_that("data_read - RDA file, model object", { skip_if_not_installed("withr") skip_if_not_installed("brms") withr::with_tempfile("temp_file", fileext = ".rds", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/model_object.rds" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) expect_message( expect_message( d <- data_read(temp_file, verbose = TRUE), # nolint regex = "Imported file is a regression" ), "Reading data" ) expect_s3_class(d, "lm") }) withr::with_tempfile("temp_file", fileext = ".rda", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/brms_1.rda" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) expect_message( expect_message( d <- data_read(temp_file, verbose = TRUE), # nolint regex = "Imported file is a regression" ), "Reading data" ) expect_s3_class(d, "brmsfit") }) }) test_that("data_read - RDS file, from URL", { # works with URL request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/model_object.rds" ) httr::stop_for_status(request) expect_message( expect_message( d <- data_read( # nolint "https://raw.github.com/easystats/circus/main/data/model_object.rds", verbose = TRUE ), regex = "Imported file is a regression" ), "Reading data" ) }) test_that("data_read - nanoparquet", { skip_if_not_installed("withr") skip_if_not_installed("nanoparquet") withr::with_tempfile("temp_file", fileext = ".parquet", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/penguins.parquet" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read(temp_file) expect_named( d, c( "species", "island", "bill_len", "bill_dep", "flipper_len", "body_mass", "sex", "year" ) ) expect_identical(dim(d), c(344L, 8L)) }) }) ================================================ FILE: tests/testthat/test-data_recode.R ================================================ # set recode pattern old=new -------------- options(data_recode_pattern = "old=new") # numeric ----------------------- set.seed(123) x <- sample(c(1:4, NA), 15, TRUE) test_that("recode numeric", { out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2)) expect_equal( out, c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), ignore_attr = TRUE ) out <- recode_values( x, list(`1` = 0, `2:3` = 1, `4` = 2, `NA` = 9), preserve_na = FALSE ) expect_equal( out, c(1, 1, 1, 1, 1, 9, 2, 0, 1, 1, 9, 1, 1, 0, 2), ignore_attr = TRUE ) out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2, `NA` = 9)) expect_equal( out, c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), ignore_attr = TRUE ) out <- recode_values( x, list(`1` = 0, `2` = 1), default = 99, preserve_na = FALSE ) expect_equal( out, c(99, 99, 1, 1, 99, 99, 99, 0, 1, 99, 99, 99, 99, 0, 99), ignore_attr = TRUE ) out <- recode_values(x, list(`1` = 0, `2` = 1), default = 99) expect_equal( out, c(99, 99, 1, 1, 99, NA, 99, 0, 1, 99, NA, 99, 99, 0, 99), ignore_attr = TRUE ) }) # Date ----------------------- set.seed(123) x <- as.Date("2022-01-01") test_that("recode date", { expect_message(recode_values(x)) }) # factor ----------------------- set.seed(123) x <- as.factor(sample(c("a", "b", "c"), 15, TRUE)) test_that("recode factor", { out <- recode_values(x, list(a = "x", `b, c` = "y")) expect_equal( out, structure( c( 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L ), .Label = c("x", "y"), class = "factor" ), ignore_attr = TRUE ) out <- recode_values(x, list(a = "x", `b, c` = "y")) expect_equal( out, structure( c( 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L ), .Label = c("x", "y"), class = "factor" ), ignore_attr = TRUE ) }) set.seed(123) x <- as.factor(sample(c("a", "b", "c", NA_character_), 15, TRUE)) test_that("recode factor", { out <- recode_values(x, list(a = "x", `b, c` = "y")) expect_equal( as.character(out), c("y", "y", "y", "y", "y", "y", "y", "y", "y", "x", NA, "y", "y", "x", "y"), ignore_attr = TRUE ) out <- recode_values(x, list(a = "x", b = NA)) expect_equal( as.character(out), c("c", "c", "c", NA, "c", NA, NA, NA, "c", "x", NA, NA, NA, "x", NA), ignore_attr = TRUE ) out <- recode_values(x, list(a = "x", b = "y"), default = "zz") expect_equal( as.character(out), c( "zz", "zz", "zz", "y", "zz", "y", "y", "y", "zz", "x", NA, "y", "y", "x", "y" ), ignore_attr = TRUE ) out <- recode_values( x, list(a = "x", b = "y"), default = "zz", preserve_na = FALSE ) expect_equal( as.character(out), c( "zz", "zz", "zz", "y", "zz", "y", "y", "y", "zz", "x", "zz", "y", "y", "x", "y" ), ignore_attr = TRUE ) }) # character ----------------------- set.seed(123) x <- as.character(sample(c("a", "b", "c"), 15, TRUE)) test_that("recode character", { out <- recode_values(x, list(a = "x", `b, c` = "y")) expect_equal( out, c( "y", "y", "y", "y", "y", "y", "y", "y", "y", "x", "y", "y", "x", "y", "y" ), ignore_attr = TRUE ) }) # data frame ----------------------- set.seed(123) d <- data.frame( x = sample(c(1:4, NA), 15, TRUE), y = as.factor(sample(c("a", "b", "c"), 15, TRUE)), stringsAsFactors = FALSE ) test_that("recode data.frame", { out <- recode_values( d, recode = list(`1` = 0, `2:3` = 1, `4` = 2, a = "x", `b, c` = "y") ) expect_equal( out, structure( list( x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), y = structure( c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L), .Label = c("x", "y"), class = "factor" ) ), row.names = c(NA, 15L), class = "data.frame" ), ignore_attr = TRUE ) out <- recode_values( d, recode = list(`1` = 0, `2:3` = 1, `4` = 2, a = "x", `b, c` = "y") ) expect_equal( out, structure( list( x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), y = structure( c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L), .Label = c("x", "y"), class = "factor" ) ), row.names = c(NA, 15L), class = "data.frame" ), ignore_attr = TRUE ) out <- recode_values( d, recode = list(`1` = 0, `2:3` = 1, `4` = 2, a = "x", `b, c` = "y"), select = is.numeric() ) expect_equal( out, structure( list( x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), y = structure( c(1L, 1L, 1L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 2L, 1L, 3L, 3L, 1L), .Label = c("a", "b", "c"), class = "factor" ) ), row.names = c(NA, 15L), class = "data.frame" ), ignore_attr = TRUE ) }) # set recode pattern back to default -------------- options(data_recode_pattern = NULL) set.seed(123) x <- sample(c(1:4, NA), 15, TRUE) test_that("recode numeric", { out <- recode_values(x, list(`0` = 1, `1` = 2:3, `2` = 4)) expect_equal( out, c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), ignore_attr = TRUE ) out <- recode_values( x, list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA), preserve_na = FALSE ) expect_equal( out, c(1, 1, 1, 1, 1, 9, 2, 0, 1, 1, 9, 1, 1, 0, 2), ignore_attr = TRUE ) out <- recode_values( x, list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA), preserve_na = TRUE ) expect_equal( out, c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), ignore_attr = TRUE ) }) test_that("recode, recode-arg is named list", { expect_warning(expect_identical( recode_values(x, recode = c(`0` = 1, `1` = 2:3, `2` = 4)), x )) }) set.seed(123) x <- as.factor(sample(c("a", "b", "c"), 15, TRUE)) test_that("recode factor", { out <- recode_values(x, list(x = "a", y = "b, c")) expect_equal( out, structure( c( 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L ), .Label = c("x", "y"), class = "factor" ), ignore_attr = TRUE ) out <- recode_values(x, list(x = "a", y = c("b", "c"))) expect_equal( out, structure( c( 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L ), .Label = c("x", "y"), class = "factor" ), ignore_attr = TRUE ) }) test_that("recode, recode-arg is named list", { expect_warning(expect_identical( recode_values(x, recode = c(x = "a", y = "b, c")), x )) }) set.seed(123) d <- data.frame( x = sample(c(1:4, NA), 15, TRUE), y = as.factor(sample(c("a", "b", "c"), 15, TRUE)), stringsAsFactors = FALSE ) test_that("recode data.frame", { out <- recode_values( d, recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = "b, c") ) expect_equal( out, structure( list( x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), y = structure( c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L), .Label = c("x", "y"), class = "factor" ) ), row.names = c(NA, 15L), class = "data.frame" ), ignore_attr = TRUE ) out <- recode_values( d, recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c")) ) expect_equal( out, structure( list( x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), y = structure( c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L), .Label = c("x", "y"), class = "factor" ) ), row.names = c(NA, 15L), class = "data.frame" ), ignore_attr = TRUE ) out <- recode_values( d, recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c")), select = is.numeric() ) expect_equal( out, structure( list( x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2), y = structure( c(1L, 1L, 1L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 2L, 1L, 3L, 3L, 1L), .Label = c("a", "b", "c"), class = "factor" ) ), row.names = c(NA, 15L), class = "data.frame" ), ignore_attr = TRUE ) }) # select helpers ------------------------------ test_that("recode_values regex", { expect_identical( recode_values( iris, select = "ies", regex = TRUE, recode = list( Group1 = "setosa", Group2 = "versicolor", Group3 = "virginica" ) ), recode_values( iris, select = "Species", recode = list( Group1 = "setosa", Group2 = "versicolor", Group3 = "virginica" ) ) ) }) ================================================ FILE: tests/testthat/test-data_relocate.R ================================================ test_that("data_relocate works as expected", { expect_error( data_relocate(iris, select = "Species", before = 2, after = 3), "You must supply only one of `before` or `after`." ) expect_error( data_relocate(iris, select = "Species", before = 10), "No valid position defined in `before`." ) expect_error( data_relocate(iris, select = "Species", after = 10), "No valid position defined in `after`." ) expect_named( data_relocate(iris, select = "Species", before = "Sepal.Length"), c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_named( data_relocate(iris, select = "Species", before = "Sepal.Width"), c("Sepal.Length", "Species", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_named( data_relocate(iris, select = "Sepal.Width", after = "Species"), names(data_relocate(iris, select = "Sepal.Width", after = -1)) ) expect_named( data_relocate( iris, select = c("Species", "Petal.Length"), after = "Sepal.Width" ), names(data_relocate(iris, select = c("Species", "Petal.Length"), after = 2)) ) }) test_that("data_relocate select-helpers", { expect_identical( colnames(data_relocate(iris, select = starts_with("Sepal"), after = 5)), colnames(iris[c(3:5, 1:2)]) ) expect_identical( colnames(data_relocate(iris, select = 1:2, after = 5)), colnames(iris[c(3:5, 1:2)]) ) expect_identical( colnames(data_relocate(iris, select = -1)), colnames(iris[c(2:5, 1)]) ) expect_identical( colnames(data_relocate(iris, select = Species, after = 1)), colnames(iris[c(1, 5, 2:4)]) ) expect_identical( colnames(data_relocate(iris, select = ~ Sepal.Width + Species)), colnames(iris[c(2, 5, 1, 3:4)]) ) expect_identical( colnames(data_relocate(iris, select = starts_with("sepal"), after = 5)), colnames(iris) ) expect_identical( colnames(data_relocate( iris, select = starts_with("sepal"), after = 5, ignore_case = TRUE )), colnames(iris[c(3:5, 1:2)]) ) }) # preserve attributes -------------------------- test_that("data_relocate preserves attributes", { skip_if_not_installed("parameters") m <- lm(Sepal.Length ~ Species, data = iris) out <- parameters::parameters(m) a1 <- attributes(out) out2 <- data_relocate(out, 4:6) a2 <- attributes(out2) # attributes may not be in the same order expect_true(all(names(a1) %in% names(a2))) expect_identical(length(a1), length(a2)) }) # select helpers ------------------------------ test_that("data_relocate regex", { expect_identical( names(data_relocate(mtcars, select = "pg", regex = TRUE, after = "carb"))[ 11 ], "mpg" ) }) # fuzzy matching ------------------------------ out <- data.frame( Parameter = "Test", Median = 0.5, CI_low = 0.4, CI_high = 0.6, pd = 0.97, Rhat = 0.99, ESS = 1000, log_BF = 3, stringsAsFactors = FALSE ) test_that("data_relocate misspelled", { # close match expect_error( data_relocate(out, "pd", before = "BF"), "log_BF" ) # close multiple matches expect_error( data_relocate(out, "pd", before = "CIl"), "CI_low" ) # not even close expect_error( data_relocate(out, "pd", before = "xyz"), "misspelled" ) }) ================================================ FILE: tests/testthat/test-data_remove.R ================================================ test_that("data_remove works as expected", { expect_identical( data_remove(BOD, "Time"), structure( list(demand = c(8.3, 10.3, 19, 16, 15.6, 19.8)), class = "data.frame", row.names = c(NA, 6L), reference = "A1.4, p. 270" ) ) }) test_that("data_remove works with NSE", { expect_named( data_remove(iris, starts_with("Sepal")), c("Petal.Length", "Petal.Width", "Species") ) expect_named( data_remove(iris, "Sepal"), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_named( data_remove(iris, c("Sepal.Length", "Sepal.Width")), c("Petal.Length", "Petal.Width", "Species") ) expect_named( data_remove(iris, regex("\\.")), "Species" ) expect_named( data_remove(iris, Sepal.Width:Petal.Width), c("Sepal.Length", "Species") ) expect_named( data_remove(iris, contains("Sep")), c("Petal.Length", "Petal.Width", "Species") ) expect_named( data_remove(iris, contains("sep")), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_named( data_remove(iris, contains("sep"), ignore_case = TRUE), c("Petal.Length", "Petal.Width", "Species") ) expect_named( data_remove(iris, 1:3), c("Petal.Width", "Species") ) expect_identical( colnames(data_remove(iris, c(1, 5))), colnames(iris)[2:4] ) expect_identical( colnames(data_remove(iris, -1:-2)), colnames(iris)[1:2] ) expect_identical( colnames(data_remove(iris, c(1, 4:5))), colnames(iris)[2:3] ) expect_identical( colnames(data_remove(iris, "abc")), colnames(iris) ) expect_named( data_remove(iris, "Species"), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_named( data_remove(iris, "species"), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_named( data_remove(iris, "species", ignore_case = TRUE), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) }) test_that("data_remove from other functions", { test_fun <- function(data, i) { data_remove(data, select = i) } expect_named( test_fun(iris, c("Sepal.Length", "Sepal.Width")), c("Petal.Length", "Petal.Width", "Species") ) }) # preserve attributes -------------------------- test_that("data_remove preserves attributes", { skip_if_not_installed("parameters") m <- lm(Sepal.Length ~ Species, data = iris) out <- parameters::parameters(m) a1 <- attributes(out) out2 <- data_remove(out, "SE") a2 <- attributes(out2) # attributes may not be in the same order expect_true(all(names(a1) %in% names(a2))) expect_identical(length(a1), length(a2)) }) # select helpers ------------------------------ test_that("data_remove regex", { expect_identical( names(data_remove(mtcars, select = "pg", regex = TRUE)), names(mtcars[-(1)]) ) }) ================================================ FILE: tests/testthat/test-data_rename.R ================================================ test <- head(iris) # basic tests -------------- test_that("data_rename works with one or several replacements", { expect_named( data_rename(test, "Sepal.Length", "length"), c("length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_named( data_rename( test, c("Sepal.Length", "Sepal.Width"), c("length", "width") ), c("length", "width", "Petal.Length", "Petal.Width", "Species") ) expect_named( data_rename(test, c(length = "Sepal.Length", width = "Sepal.Width")), c("length", "width", "Petal.Length", "Petal.Width", "Species") ) }) test_that("data_rename cannot have a partially named vector", { expect_error( data_rename(test, c(length = "Sepal.Length", "Sepal.Width")), "all elements must" ) }) test_that("data_rename returns a data frame", { x <- data_rename(test, "Sepal.Length", "length") expect_s3_class(x, "data.frame") }) test_that("data_rename: multiple selection types", { expect_named( data_rename(test, select = 1, "foo"), c("foo", names(iris)[2:5]) ) expect_named( data_rename(test, select = regex("tal"), c("foo1", "foo2")), c("Sepal.Length", "Sepal.Width", "foo1", "foo2", "Species") ) }) test_that("data_rename: replacement not allowed to have NA or empty strings", { expect_error( data_rename( test, select = c("Species", "Sepal.Length"), replacement = c("foo", NA_character_) ), regexp = "`replacement` is not allowed" ) }) # replacement ------------- test_that("data_rename errors when no replacement", { expect_error( data_rename(test, select = c("Sepal.Length", "Petal.Length")), "There are more names in `select` than in `replacement`" ) }) test_that("data_rename errors when too many names in 'replacement'", { expect_error( data_rename(test, replacement = paste0("foo", 1:6)), "There are more names in `replacement` than in `select`" ) }) test_that("data_rename works when not enough names in 'replacement'", { expect_error( data_rename(test, replacement = paste0("foo", 1:2)), "There are more names in `select` than in `replacement`" ) }) # no select -------------- test_that("data_rename errors when select = NULL", { expect_error( data_rename(test), "more names in `select`" ) }) # other -------------- test_that("data_rename deals correctly with duplicated replacement", { x <- data_rename( test, select = names(test)[1:4], replacement = c("foo", "bar", "foo", "bar") ) expect_identical(dim(test), dim(x)) expect_named(x[1:4], c("foo", "bar", "foo.2", "bar.2")) }) # preserve attributes -------------------------- test_that("data_rename preserves attributes", { skip_if_not_installed("parameters") m <- lm(Sepal.Length ~ Species, data = iris) out <- parameters::parameters(m) a1 <- attributes(out) out2 <- data_rename(out, "p", "p-val") a2 <- attributes(out2) expect_named(a1, names(a2)) }) # glue-styled select -------------------------- test_that("data_rename glue-style", { data(mtcars) out <- data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}") expect_named(out, c("formerly_mpg", "formerly_cyl", "formerly_disp")) out <- data_rename( mtcars[1:3], c("mpg", "cyl", "disp"), "{col}_is_column_{n}" ) expect_named(out, c("mpg_is_column_1", "cyl_is_column_2", "disp_is_column_3")) out <- data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}") expect_named(out, c("new_a", "new_b", "new_c")) }) test_that("data_rename enough letters", { data(efc, package = "datawizard") data(mtcars) data(iris) data(ChickWeight) data(ToothGrowth) data(USArrests) data(airquality) x <- cbind( mtcars[1:5, ], iris[1:5, ], efc[1:5, ], ChickWeight[1:5, ], ToothGrowth[1:5, ], USArrests[1:5, ], airquality[1:5, ] ) expect_named( data_rename(x, replacement = "long_letter_{letter}"), c( "long_letter_a1", "long_letter_b1", "long_letter_c1", "long_letter_d1", "long_letter_e1", "long_letter_f1", "long_letter_g1", "long_letter_h1", "long_letter_i1", "long_letter_j1", "long_letter_k1", "long_letter_l1", "long_letter_m1", "long_letter_n1", "long_letter_o1", "long_letter_p1", "long_letter_q1", "long_letter_r1", "long_letter_s1", "long_letter_t1", "long_letter_u1", "long_letter_v1", "long_letter_w1", "long_letter_x1", "long_letter_y1", "long_letter_z1", "long_letter_a2", "long_letter_b2", "long_letter_c2", "long_letter_d2", "long_letter_e2", "long_letter_f2", "long_letter_g2", "long_letter_h2", "long_letter_i2", "long_letter_j2", "long_letter_k2", "long_letter_l2" ) ) }) skip_if_not_installed("withr") withr::with_environment( new.env(), test_that("data_rename glue-style, environment", { data(mtcars) x <- c("hi", "there", "!") out <- data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}") expect_named(out, c("col_hi", "col_there", "col_!")) expect_error( data_rename(mtcars[1:3], c("mpg", "disp"), "col_{x}"), regex = "The number of values" ) }) ) withr::with_environment( new.env(), test_that("data_rename glue-style, object not in environment", { data(mtcars) expect_error( data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}"), regex = "The object" ) }) ) withr::with_environment( new.env(), test_that("data_rename glue-style, function in environment", { data(mtcars) my_fun <- function(cols_to_rename) { data_rename(head(mtcars)[, 1:6], cols_to_rename, "new_{col}") } expect_named( my_fun(c("mpg", "drat")), c("new_mpg", "cyl", "disp", "hp", "new_drat", "wt") ) expect_named( my_fun("mpg"), c("new_mpg", "cyl", "disp", "hp", "drat", "wt") ) }) ) test_that("works with lists", { result <- list(x = 1, y = 2) expect_error( data_rename(result, select = names(result), replacement = c("a", "b")), regex = "must be a data frame" ) }) ================================================ FILE: tests/testthat/test-data_reorder.R ================================================ test_that("data_reorder works as expected", { expect_named( data_reorder(iris, c("Species", "Sepal.Length")), c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_warning(expect_named( data_reorder(iris, c("Species", "dupa")), c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") )) }) # preserve attributes -------------------------- test_that("data_reorder preserves attributes", { skip_if_not_installed("parameters") m <- lm(Sepal.Length ~ Species, data = iris) out <- parameters::parameters(m) a1 <- attributes(out) out2 <- data_reorder(out, 4:6) a2 <- attributes(out2) # attributes may not be in the same order expect_true(all(names(a1) %in% names(a2))) expect_length(a1, length(a2)) }) ================================================ FILE: tests/testthat/test-data_replicate.R ================================================ test_that("data_replicate: simple use case", { data(mtcars) d <- head(mtcars) out <- data_replicate(d, "carb") expect_identical(dim(out), c(13L, 10L)) expect_identical( out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225) ) expect_named( out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear") ) out <- data_replicate(d, 11) expect_identical(dim(out), c(13L, 10L)) expect_identical( out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225) ) expect_named( out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear") ) d$mpg[5] <- NA out <- data_replicate(d, "carb") expect_identical(dim(out), c(13L, 10L)) expect_identical( out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 22.8, 21.4, NA, NA, 18.1) ) expect_named( out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear") ) d$carb[3] <- NA out <- data_replicate(d, "carb", remove_na = TRUE) expect_identical(dim(out), c(12L, 10L)) expect_identical( out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 21.4, NA, NA, 18.1) ) expect_named( out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear") ) out <- data_replicate(d, "carb", select = c("disp", "hp"), remove_na = TRUE) expect_identical(dim(out), c(12L, 2L)) expect_identical( out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 258, 360, 360, 225) ) expect_named(out, c("disp", "hp")) d <- data.frame( a = c("a", "b", "c"), b = 1:3, rep = c(3, 2, 4), stringsAsFactors = FALSE ) out <- data_replicate(d, "rep") expect_identical(out$a, c("a", "a", "a", "b", "b", "c", "c", "c", "c")) }) test_that("data_replicate: errors", { data(mtcars) d <- head(mtcars) expect_error(data_replicate(d), regex = "No column") expect_error( data_replicate(d, expand = c("mpg", "gear")), regex = "a single string" ) expect_error( data_replicate(d, expand = "geas"), regex = "The column provided" ) expect_error( data_replicate(d, expand = "qsec"), regex = "The column provided" ) d$carb[3] <- NA expect_error(data_replicate(d, "carb"), regex = "missing values") d <- head(mtcars) d$carb[3] <- Inf expect_error(data_replicate(d, "carb"), regex = "infinite values") }) test_that("data_replicate: don't simplify if only one column left", { a <- c(1, 2, 3, 4) b <- c(4, 3, 2, 1) nrtimes <- c(1, 2, 0, 1) d <- data.frame(a, b, nrtimes) out <- data_replicate(d, expand = "nrtimes") expect_identical(dim(out), c(4L, 2L)) d <- data.frame(a, nrtimes) out <- data_replicate(d, expand = "nrtimes") expect_identical(dim(out), c(4L, 1L)) }) ================================================ FILE: tests/testthat/test-data_rescale.R ================================================ test_that("rescale works as expected", { expect_equal( rescale(c(0, 1, 5, -5, -2), to = NULL), c(0, 1, 5, -5, -2), ignore_attr = TRUE ) expect_equal( rescale(rep(NA_real_, 3)), rep(NA_real_, 3), ignore_attr = TRUE ) expect_message(rescale(iris$Species)) expect_equal( rescale(c(0, 1, 5, -5, -2)), c(50, 60, 100, 0, 30), ignore_attr = TRUE ) expect_equal( rescale(c(0, 1, 5, -5, -2), to = c(-5, 5)), c(0, 1, 5, -5, -2), ignore_attr = TRUE ) expect_equal( rescale(c(1, 3, 4), to = c(0, 40), range = c(0, 4)), c(10, 30, 40), ignore_attr = TRUE ) expect_snapshot(head(rescale(iris, to = c(0, 1)))) expect_snapshot(head(rescale(iris, to = c(0, 1), select = "Sepal.Length"))) expect_snapshot( head(rescale( iris, to = list( Sepal.Length = c(0, 1), Petal.Length = c(-1, 0) ) )) ) }) test_that("rescale works with select helpers", { out <- rescale(iris, to = c(0, 1), select = c("Sepal.Width", "Sepal.Length")) expect_equal( head(out$Sepal.Width), c(0.625, 0.41667, 0.5, 0.45833, 0.66667, 0.79167), tolerance = 1e-3 ) expect_equal( head(out$Petal.Length), head(iris$Petal.Length), tolerance = 1e-3 ) # check class attributes expect_identical( vapply(out, class, character(1)), c( Sepal.Length = "numeric", Sepal.Width = "numeric", Petal.Length = "numeric", Petal.Width = "numeric", Species = "factor" ) ) out <- rescale(iris, to = c(0, 1), select = starts_with("Sepal")) expect_equal( head(out$Sepal.Width), c(0.625, 0.41667, 0.5, 0.45833, 0.66667, 0.79167), tolerance = 1e-3 ) expect_equal( head(out$Petal.Length), head(iris$Petal.Length), tolerance = 1e-3 ) skip_if_not_installed("poorman") x <- poorman::group_by(iris, Species) out <- rescale(x, to = c(0, 1), select = starts_with("Sepal")) expect_equal( head(out$Sepal.Width), c(0.57143, 0.33333, 0.42857, 0.38095, 0.61905, 0.7619), tolerance = 1e-3 ) expect_equal( head(out$Petal.Length), head(iris$Petal.Length), tolerance = 1e-3 ) }) # grouped df ------------------------------ test_that("rescale works grouped df and append", { out <- rescale( iris, to = c(0, 1), select = c("Sepal.Width", "Sepal.Length"), append = TRUE ) expect_equal( head(out$Sepal.Width_r), c(0.625, 0.41667, 0.5, 0.45833, 0.66667, 0.79167), tolerance = 1e-3 ) expect_equal( head(out$Petal.Length), head(iris$Petal.Length), tolerance = 1e-3 ) expect_identical( colnames(out), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Width_r", "Sepal.Length_r" ) ) skip_if_not_installed("poorman") x <- poorman::group_by(iris, Species) out <- rescale(x, to = c(0, 1), select = starts_with("Sepal"), append = TRUE) expect_equal( head(out$Sepal.Width_r), c(0.57143, 0.33333, 0.42857, 0.38095, 0.61905, 0.7619), tolerance = 1e-3 ) expect_equal( head(out$Petal.Length), head(iris$Petal.Length), tolerance = 1e-3 ) expect_identical( colnames(out), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Length_r", "Sepal.Width_r" ) ) }) # select helpers ------------------------------ test_that("data_rescale regex", { expect_equal( rescale(mtcars, select = "pg", regex = TRUE)$mpg, rescale(mtcars, select = "mpg")$mpg, ignore_attr = TRUE ) }) # expanding range ------------------------------ test_that("data_rescale can expand range", { # for vectors x <- 5:15 expect_equal( rescale(x, multiply = 1.1), c(4.5, 5.6, 6.7, 7.8, 8.9, 10, 11.1, 12.2, 13.3, 14.4, 15.5), ignore_attr = TRUE ) expect_equal( rescale(x, multiply = 1.1), rescale(x, add = 0.5), ignore_attr = TRUE ) expect_error(rescale(x, multiply = 0.9, add = 1), regex = "Only one of") expect_error(rescale(x, multiply = c(1.2, 1.4)), regex = "The length of") # different values for add expect_equal( rescale(x, add = c(1, 3)), c(4, 5.4, 6.8, 8.2, 9.6, 11, 12.4, 13.8, 15.2, 16.6, 18), ignore_attr = TRUE ) expect_error(rescale(x, add = 1:3), regex = "The length of") # works with NA expect_equal( rescale(rep(NA_real_, 3), multiply = 1.1), rep(NA_real_, 3), ignore_attr = TRUE ) expect_equal( rescale(rep(NA_real_, 3), add = 2), rep(NA_real_, 3), ignore_attr = TRUE ) # for data frames d <- data.frame(x = 5:15, y = 5:15) expect_equal( rescale(d, multiply = 1.1), rescale(d, add = 0.5), ignore_attr = TRUE ) expect_equal( rescale(d, multiply = list(x = 1.1, y = 0.5)), rescale(d, add = list(x = 0.5, y = -2.5)), ignore_attr = TRUE ) # data frames accept multiple add-values per column out <- rescale(d, add = list(x = c(1, 3), y = c(2, 4))) expect_equal( out$x, rescale(d$x, add = c(1, 3)), ignore_attr = TRUE ) expect_equal( out$y, rescale(d$y, add = c(2, 4)), ignore_attr = TRUE ) expect_error(rescale(d, multiply = 0.9, add = 1), regex = "Only one of") expect_error( rescale(d, multiply = list(x = 0.9, y = 2), add = list(y = 1)), regex = "Only one of" ) expect_error(rescale(d, multiply = c(0.9, 1.5)), regex = "The length of") }) ================================================ FILE: tests/testthat/test-data_restoretype.R ================================================ test_that("data_restoretype works with reference", { data <- data.frame( Sepal.Length = c("1", "3", "2"), Species = c("setosa", "versicolor", "setosa"), New = c("1", "3", "4"), stringsAsFactors = FALSE ) fixed <- data_restoretype(data, reference = iris) expect_equal(typeof(fixed$Species), typeof(iris$Species)) expect_equal(typeof(fixed$Sepal.Length), typeof(iris$Sepal.Length)) expect_equal(typeof(fixed$New), "character") }) test_that("data_restoretype works without reference", { data <- data.frame( Sepal.Length = c("1", "3", "2"), Species = c("setosa", "versicolor", "setosa"), New = c("1", "3", "4"), stringsAsFactors = FALSE ) expect_equal( data_restoretype(data, reference = NULL), data.frame( Sepal.Length = c(1, 3, 2), Species = c("setosa", "versicolor", "setosa"), New = c(1, 3, 4), stringsAsFactors = FALSE ) ) }) ================================================ FILE: tests/testthat/test-data_reverse.R ================================================ # explanation of how reverse works: # https://github.com/easystats/datawizard/issues/106#issuecomment-1066628399 test_that("reverse works with numeric", { expect_identical( reverse(1:5), as.double(5:1) ) expect_identical( reverse(-2:2), as.double(2:-2) ) }) test_that("reverse works with factor", { expect_identical( reverse(factor(1:5)), factor(5:1) ) expect_identical( reverse(factor(-2:2)), factor(2:-2) ) }) test_that("reverse works with data frame", { test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse(test, select = "x"), data.frame( x = as.double(5:1), y = c(3, 8, 2, 5, 1) ) ) expect_identical( reverse(test, exclude = "x"), data.frame( x = 1:5, y = c(6, 1, 7, 4, 8) ) ) expect_identical( reverse(test), data.frame( x = as.double(5:1), y = c(6, 1, 7, 4, 8) ) ) }) test_that("reverse works with data frame and append", { test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse(test, select = "x", append = TRUE), data.frame( x = 1:5, y = c(3, 8, 2, 5, 1), x_r = as.double(5:1) ) ) expect_identical( reverse(test, append = TRUE), data.frame( x = 1:5, y = c(3, 8, 2, 5, 1), x_r = as.double(5:1), y_r = c(6, 1, 7, 4, 8) ) ) }) test_that("reverse: arg 'select' works with formula", { test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse(test, select = ~x), data.frame( x = as.double(5:1), y = c(3, 8, 2, 5, 1) ) ) expect_identical( reverse(test, select = ~ x + y), data.frame( x = as.double(5:1), y = c(6, 1, 7, 4, 8) ) ) }) test_that("reverse: arg 'exclude' works with formula", { test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse(test, exclude = ~x), data.frame( x = 1:5, y = c(6, 1, 7, 4, 8) ) ) expect_identical( reverse(test, exclude = ~ x + y), test ) }) test_that("reverse: argument 'range' works", { expect_identical( reverse(c(1, 3, 4), range = c(0, 4)), c(3, 1, 0) ) expect_identical( reverse(factor(c(1, 2, 3, 4, 5)), range = 0:10), factor(9:5, levels = 0:10) ) test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse(test, select = "x", range = c(0, 8)), data.frame( x = as.double(7:3), y = c(3, 8, 2, 5, 1) ) ) expect_identical( reverse(test, range = c(0, 8)), data.frame( x = as.double(7:3), y = c(5, 0, 6, 3, 7) ) ) }) test_that("reverse ignores NA", { expect_identical( reverse(c(1, 2, 8, NA)), c(8, 7, 1, NA) ) }) test_that("reverse returns NA if only NA provided", { expect_identical( reverse(c(NA_real_, NA_real_)), c(NA_real_, NA_real_) ) expect_identical( reverse(factor(c(NA, NA))), factor(c(NA, NA)) ) }) test_that("reverse warns if single value to reverse", { expect_warning( reverse(1), regexp = "A `range` must be provided for data with only one unique value." ) expect_warning( reverse(factor(1)), regexp = "A `range` must be provided for data with only one unique value." ) }) test_that("reverse msg for unsupported", { expect_message(reverse(as.Date(c("2022-04-24", "2022-04-23")))) }) # Same tests with reverse_scale (alias) -------------------------- test_that("reverse_scale works with numeric", { expect_identical( reverse_scale(1:5), as.double(5:1) ) expect_identical( reverse_scale(-2:2), as.double(2:-2) ) }) test_that("reverse_scale works with factor", { expect_identical( reverse_scale(factor(1:5)), factor(5:1) ) expect_identical( reverse_scale(factor(-2:2)), factor(2:-2) ) }) test_that("reverse_scale works with data frame", { test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse_scale(test, select = "x"), data.frame( x = as.double(5:1), y = c(3, 8, 2, 5, 1) ) ) expect_identical( reverse_scale(test, exclude = "x"), data.frame( x = 1:5, y = c(6, 1, 7, 4, 8) ) ) expect_identical( reverse_scale(test), data.frame( x = as.double(5:1), y = c(6, 1, 7, 4, 8) ) ) }) test_that("reverse_scale: arg 'select' works with formula", { test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse_scale(test, select = ~x), data.frame( x = as.double(5:1), y = c(3, 8, 2, 5, 1) ) ) expect_identical( reverse_scale(test, select = ~ x + y), data.frame( x = as.double(5:1), y = c(6, 1, 7, 4, 8) ) ) }) test_that("reverse_scale: arg 'exclude' works with formula", { test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse_scale(test, exclude = ~x), data.frame( x = 1:5, y = c(6, 1, 7, 4, 8) ) ) expect_identical( reverse_scale(test, exclude = ~ x + y), test ) }) test_that("reverse_scale: argument 'range' works", { expect_identical( reverse_scale(c(1, 3, 4), range = c(0, 4)), c(3, 1, 0) ) expect_identical( reverse_scale(factor(c(1, 2, 3, 4, 5)), range = 0:10), factor(9:5, levels = 0:10) ) test <- data.frame( x = 1:5, y = c(3, 8, 2, 5, 1) ) expect_identical( reverse_scale(test, select = "x", range = c(0, 8)), data.frame( x = as.double(7:3), y = c(3, 8, 2, 5, 1) ) ) expect_identical( reverse_scale(test, range = c(0, 8)), data.frame( x = as.double(7:3), y = c(5, 0, 6, 3, 7) ) ) }) test_that("reverse_scale ignores NA", { expect_identical( reverse_scale(c(1, 2, 8, NA)), c(8, 7, 1, NA) ) }) test_that("reverse_scale returns NA if only NA provided", { expect_identical( reverse_scale(c(NA_real_, NA_real_)), c(NA_real_, NA_real_) ) expect_identical( reverse_scale(factor(c(NA, NA))), factor(c(NA, NA)) ) }) test_that("reverse_scale warns if single value to reverse", { expect_warning( reverse_scale(1), regexp = "A `range` must be provided for data with only one unique value." ) expect_warning( reverse_scale(factor(1)), regexp = "A `range` must be provided for data with only one unique value." ) }) test_that("reverse_scale select helpers", { data(iris) out <- rescale( iris, to = list( Sepal.Length = c(0, 1), Petal.Length = c(-1, 0) ), select = ends_with("length") ) expect_identical(out$Sepal.Length, iris$Sepal.Length, tolerance = 1e-3) out <- rescale( iris, to = list( Sepal.Length = c(0, 1), Petal.Length = c(-1, 0) ), select = ends_with("length"), ignore_case = TRUE ) expect_identical( head(out$Sepal.Length), c(0.22222, 0.16667, 0.11111, 0.08333, 0.19444, 0.30556), tolerance = 1e-3 ) }) # with grouped data ------------------------------------------- set.seed(123) value1 <- sample(1:10, 6, replace = TRUE) set.seed(456) value2 <- sample(1:10, 6, replace = TRUE) test_df <- data.frame( id = rep(c("A", "B"), each = 3), value1 = value1, value2 = value2, stringsAsFactors = FALSE ) test_that("reverse works with data frames (grouped data)", { skip_if_not_installed("poorman") expect_identical( test_df %>% poorman::group_by(id) %>% reverse(exclude = "id") %>% poorman::ungroup(), data.frame( id = rep(c("A", "B"), each = 3), value1 = c(10, 10, 3, 6, 2, 3), value2 = c(4, 6, 3, 10, 6, 5), stringsAsFactors = FALSE ) ) }) test_that("reverse works with grouped data frames and append", { skip_if_not_installed("poorman") test <- data.frame( x = 1:6, y = c(3, 8, 2, 5, 1, 4), grp = rep(c("a", "b"), 3), stringsAsFactors = FALSE ) expect_identical( test %>% poorman::group_by(grp) %>% reverse(append = TRUE) %>% poorman::ungroup(), data.frame( x = 1:6, y = c(3, 8, 2, 5, 1, 4), grp = rep(c("a", "b"), 3), x_r = as.double(c(5, 6, 3, 4, 1, 2)), y_r = as.double(c(1, 4, 2, 7, 3, 8)), stringsAsFactors = FALSE ) ) }) set.seed(789) value1 <- sample(c(1:10, NA), 6, replace = TRUE) set.seed(10) value2 <- sample(c(1:10, NA), 6, replace = TRUE) test_df <- data.frame( id = rep(c("A", "B"), each = 3), value1 = value1, value2 = value2, stringsAsFactors = FALSE ) test_that("reverse works with data frames containing NAs (grouped data)", { skip_if_not_installed("poorman") expect_identical( test_df %>% poorman::group_by(id) %>% reverse(exclude = "id") %>% poorman::ungroup(), data.frame( id = rep(c("A", "B"), each = 3), value1 = c(10, 4, 4, 5, 3, 4), value2 = c(NA, 10, 9, 7, 6, 8), stringsAsFactors = FALSE ) ) }) # select helpers ------------------------------ test_that("reverse regex", { expect_identical( reverse(mtcars, select = "arb", regex = TRUE), reverse(mtcars, select = "carb") ) }) # work or give informative errors / warnings (#380) ------------------ test_that("reverse, larger range", { # works expect_identical( reverse(c(1, 3, 4), range = c(0, 4)), c(3, 1, 0) ) expect_identical( reverse(factor(c(1, 3, 4)), range = 0:4), structure( c(4L, 2L, 1L), levels = c("0", "1", "2", "3", "4"), class = "factor" ) ) expect_identical( reverse(factor(c(1, 3, 4)), range = c(0, 4)), structure( c(4L, 2L, 1L), levels = c("0", "1", "2", "3", "4"), class = "factor" ) ) # errors on invalid input expect_error(reverse(c(1, 3, 4), range = 0:4)) expect_error(reverse(factor(c(1, 3, 4, 5)), range = c(0, 2, 4))) # errors on invalid input (NA in range) expect_error(reverse(c(1, 3, 4), range = c(1, NA)), regex = "missing") expect_error( reverse(factor(letters[1:3]), range = c(1, NA)), regex = "missing" ) # warns expect_warning( reverse(factor(c("a", "b", "c")), range = c(1, 3, 5, 7)), regex = "No current" ) expect_warning( reverse(factor(c(9, 10, 11)), range = c(1, 3, 5, 7)), regex = "No current" ) expect_warning( reverse(factor(c(1, 3, 11)), range = c(1, 3, 5, 7)), regex = "Not all" ) # silent expect_silent(reverse( factor(c("a", "b", "c")), range = c(1, 3, 5, 7), verbose = FALSE )) expect_silent(reverse( factor(c(9, 10, 11)), range = c(1, 3, 5, 7), verbose = FALSE )) expect_silent(reverse( factor(c(1, 3, 11)), range = c(1, 3, 5, 7), verbose = FALSE )) # works as intended expect_identical( reverse(factor(c(1, 3, 11)), range = c(1, 3, 5, 7), verbose = FALSE), structure(c(4L, 3L, NA), levels = c("1", "3", "5", "7"), class = "factor") ) }) ================================================ FILE: tests/testthat/test-data_rotate.R ================================================ test_that("rotate data works as expected", { df <- mtcars[1:3, 1:4] expect_equal( data_rotate(df), structure( list( `Mazda RX4` = c(21, 6, 160, 110), `Mazda RX4 Wag` = c(21, 6, 160, 110), `Datsun 710` = c(22.8, 4, 108, 93) ), class = "data.frame", row.names = c("mpg", "cyl", "disp", "hp") ) ) expect_equal( data_rotate(df, rownames = "property"), structure( list( property = c("mpg", "cyl", "disp", "hp"), `Mazda RX4` = c(21, 6, 160, 110), `Mazda RX4 Wag` = c(21, 6, 160, 110), `Datsun 710` = c(22.8, 4, 108, 93) ), class = "data.frame", row.names = c(NA, 4L) ) ) expect_equal( data_rotate(df, colnames = TRUE), structure( list( `21` = c(6, 160, 110), `21` = c(6, 160, 110), `22.8` = c(4, 108, 93) ), class = "data.frame", row.names = c("cyl", "disp", "hp") ) ) expect_equal( data_rotate(df, rownames = "property", colnames = TRUE), structure( list( property = c("cyl", "disp", "hp"), `21` = c(6, 160, 110), `21` = c(6, 160, 110), `22.8` = c(4, 108, 93) ), class = "data.frame", row.names = c(NA, 3L) ) ) }) test_that("data_rotate, arg 'colnames' works", { df <- mtcars[1:3, 1:4] df <- rownames_as_column(df) expected <- data.frame( `Mazda RX4` = c(21, 6, 160, 110), `Mazda RX4 Wag` = c(21, 6, 160, 110), `Datsun 710` = c(22.8, 4, 108, 93), check.names = FALSE ) row.names(expected) <- c("mpg", "cyl", "disp", "hp") expect_identical( data_rotate(df, colnames = "rowname"), expected ) }) test_that("data_rotate warns if mixed types of data", { df <- mtcars[1:3, 1:4] df <- rownames_as_column(df) expect_warning( data_rotate(df), "mixed types of data" ) df$rowname <- factor(df$rowname) expect_warning( data_rotate(df), "mixed types of data" ) }) ================================================ FILE: tests/testthat/test-data_seek.R ================================================ test_that("data_seek - simple use case", { data(iris) out <- data_seek(iris, "Length") expect_identical(out$index, c(1L, 3L)) expect_identical(out$labels, c("Sepal.Length", "Petal.Length")) }) test_that("data_seek - seek label attribute", { data(efc) out <- data_seek(efc, "dependency") expect_identical(out$index, which(colnames(efc) == out$column)) expect_identical(out$labels, "elder's dependency") }) test_that("data_seek - seek label attribute", { data(efc) out <- data_seek(efc, "female") expect_identical(nrow(out), 0L) out <- data_seek(efc, "female", seek = "all") expect_identical(out$index, which(colnames(efc) == out$column)) expect_identical(out$labels, "elder's gender") }) test_that("data_seek - fuzzy match", { data(iris) out <- data_seek(iris, "Lenght") expect_identical(nrow(out), 0L) out <- data_seek(iris, "Lenght", fuzzy = TRUE) expect_identical(out$index, which(colnames(iris) %in% out$column)) expect_identical(out$labels, c("Sepal.Length", "Petal.Length")) }) test_that("data_seek - fuzzy match, value labels", { data(efc) out <- data_seek(efc, "femlae", seek = "all", fuzzy = TRUE) expect_identical(nrow(out), 1L) expect_identical(out$index, which(colnames(efc) %in% out$column)) expect_identical(out$labels, "elder's gender") }) test_that("data_seek - multiple pattern", { data(efc) out <- data_seek(efc, c("e16", "e42")) expect_identical(nrow(out), 2L) expect_identical(out$index, which(colnames(efc) %in% out$column)) expect_identical(out$labels, c("elder's gender", "elder's dependency")) # only one match, typo out <- data_seek(efc, c("femlae", "dependency")) expect_identical(nrow(out), 1L) expect_identical(out$index, which(colnames(efc) %in% out$column)) expect_identical(out$labels, "elder's dependency") # only one match, not searching in value labels out <- data_seek(efc, c("female", "dependency")) expect_identical(nrow(out), 1L) expect_identical(out$index, which(colnames(efc) %in% out$column)) expect_identical(out$labels, "elder's dependency") # two matches out <- data_seek(efc, c("female", "dependency"), seek = "all") expect_identical(nrow(out), 2L) expect_identical(out$index, which(colnames(efc) %in% out$column)) expect_identical(out$labels, c("elder's gender", "elder's dependency")) # only one match, typo out <- data_seek(efc, c("femlae", "dependency"), seek = "all") expect_identical(nrow(out), 1L) expect_identical(out$index, which(colnames(efc) %in% out$column)) expect_identical(out$labels, "elder's dependency") # two matches, despite typo out <- data_seek(efc, c("femlae", "dependency"), seek = "all", fuzzy = TRUE) expect_identical(nrow(out), 2L) expect_identical(out$index, which(colnames(efc) %in% out$column)) expect_identical(out$labels, c("elder's gender", "elder's dependency")) }) test_that("data_seek - valid input", { expect_error( data_seek(rnorm(10), "Length"), regex = "`data` must be a data frame." ) expect_error( data_seek(iris, "Length", seek = "somewhere"), regex = "`seek` must be" ) }) test_that("data_seek - print", { expect_snapshot(data_seek(iris, "Length")) expect_snapshot(data_seek(iris, "abc")) }) ================================================ FILE: tests/testthat/test-data_select.R ================================================ # input check --------------------- test_that("data_select checks for data frame", { expect_error(data_select(NULL), regexp = "provided") x <- list(a = 1:2, b = letters[1:3]) expect_error(data_select(x), regexp = "coerced") }) # select helpers --------------------- test_that("data_select works with select helpers", { expect_identical( data_select(iris, starts_with("Sepal")), iris[c("Sepal.Length", "Sepal.Width")] ) expect_identical( data_select(iris, ends_with("Width")), iris[c("Sepal.Width", "Petal.Width")] ) expect_identical( data_select(iris, regex("\\.")), iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")] ) expect_identical( data_select(iris, contains("Wid")), iris[c("Sepal.Width", "Petal.Width")] ) }) # select helpers, negation --------------------- test_that("data_select works with negation of select helpers", { expect_identical( data_select(iris, -starts_with("Sepal")), iris[c("Petal.Length", "Petal.Width", "Species")] ) expect_identical( data_select(iris, -ends_with("Width")), iris[c("Sepal.Length", "Petal.Length", "Species")] ) }) # select-nse with function --------------------- test_that("data_select works with select-functions", { expect_identical( data_select(iris, is.numeric()), iris[sapply(iris, is.numeric)] ) expect_identical( data_select(iris, is.numeric), iris[sapply(iris, is.numeric)] ) expect_identical( data_select(iris, is.factor()), iris[sapply(iris, is.factor)] ) expect_identical( data_select(iris, is.factor), iris[sapply(iris, is.factor)] ) expect_warning(expect_null(data_select(iris, is.logical()))) }) # select-nse with user-function --------------------- testfun <- function(i) { is.numeric(i) && mean(i, na.rm = TRUE) > 3.5 } test_that("data_select works with user-defined select-functions", { expect_identical(data_select(iris, testfun), iris[sapply(iris, testfun)]) expect_identical(data_select(iris, -testfun), iris[!sapply(iris, testfun)]) testfun2 <- function(i) { is.numeric(i) && mean(i, na.rm = TRUE) < 5 } expect_identical( data_select(iris, select = testfun, exclude = testfun2), iris["Sepal.Length"] ) expect_identical( data_select(iris, select = testfun, exclude = -testfun2), iris["Petal.Length"] ) }) # select-nse with negation of functions --------------------- test_that("data_select works with negated select-functions", { expect_identical( data_select(iris, -is.numeric()), iris[sapply(iris, function(i) !is.numeric(i))] # nolint ) expect_identical( data_select(iris, -is.numeric), iris[sapply(iris, function(i) !is.numeric(i))] # nolint ) expect_identical( data_select(iris, -is.factor()), iris[sapply(iris, function(i) !is.factor(i))] # nolint ) expect_identical( data_select(iris, -is.factor), iris[sapply(iris, function(i) !is.factor(i))] # nolint ) expect_identical(data_select(iris, -is.logical), iris) }) # select-nse with ranges --------------------- test_that("data_select works with ranges", { expect_identical( data_select(iris, 2:3), iris[2:3] ) expect_identical( data_select(iris, Sepal.Width:Petal.Length), iris[2:3] ) }) # select-nse with negated ranges --------------------- test_that("data_select works with negated ranges", { expect_identical( data_select(iris, -(1:2)), iris[c(3, 4, 5)] ) expect_identical( data_select(iris, -1:-2), iris[c(3, 4, 5)] ) expect_identical( data_select(iris, exclude = -1:-2), iris[1:2] ) expect_identical( data_select(iris, exclude = 2:3), iris[c(1, 4, 5)] ) expect_error( data_select(iris, -Sepal.Width:Petal.Length), "can't mix negative and positive" ) expect_identical( data_select(iris, -(Sepal.Width:Petal.Length)), iris[c(1, 4, 5)] ) }) # select-nse with formulas --------------------- test_that("data_select works with formulas", { expect_identical( data_select(iris, ~ Sepal.Width + Petal.Length), iris[2:3] ) expect_identical( data_select(iris, exclude = ~ Sepal.Width + Petal.Length), iris[c(1, 4, 5)] ) }) # select-nse, other cases --------------------- test_that("data_select works, other cases", { expect_identical(data_select(iris), iris) expect_identical( data_select(iris, c("Petal.Width", "Sepal.Length")), iris[c("Petal.Width", "Sepal.Length")] ) expect_identical( data_select(iris, -c("Petal.Width", "Sepal.Length")), iris[setdiff(colnames(iris), c("Petal.Width", "Sepal.Length"))] ) expect_identical( data_select(iris, -Petal.Width), iris[setdiff(colnames(iris), "Petal.Width")] ) expect_identical( data_select(mtcars, c("am", "gear", "cyl")), mtcars[c("am", "gear", "cyl")] ) expect_identical( data_select(mtcars, c("vam", "gear", "cyl")), mtcars[c("gear", "cyl")] ) expect_warning(expect_null(data_select(mtcars, ends_with("abc")))) expect_identical( data_select(mtcars, regex("rb$")), mtcars["carb"] ) expect_identical( data_select(mtcars, regex("^c")), mtcars[c("cyl", "carb")] ) expect_warning(expect_null(data_select(mtcars, "^c"))) expect_identical( data_select(mtcars, regex("^C"), ignore_case = TRUE), mtcars[c("cyl", "carb")] ) }) # select-nse works when called from other function --------------------- test_that("data_select from other functions", { test_fun1 <- function(data, i) { data_select(data, select = i) } expect_identical( test_fun1(iris, c("Sepal.Length", "Sepal.Width")), iris[c("Sepal.Length", "Sepal.Width")] ) expect_identical( test_fun1(iris, starts_with("Sep")), iris[c("Sepal.Length", "Sepal.Width")] ) test_fun1a <- function(data, i) { data_select(data, select = i, regex = TRUE) } expect_identical( test_fun1a(iris, "Sep"), iris[c("Sepal.Length", "Sepal.Width")] ) test_fun1b <- function(data, i) { data_select(data, select = i, regex = TRUE) } expect_identical( test_fun1b(iris, "Width$"), iris[c("Sepal.Width", "Petal.Width")] ) test_fun1c <- function(data, i) { data_select(data, select = -i) } expect_identical( test_fun1c(iris, c("Sepal.Length", "Sepal.Width")), iris[c("Petal.Length", "Petal.Width", "Species")] ) test_fun2 <- function(data) { data_select(data, select = starts_with("Sep")) } expect_identical( test_fun2(iris), iris[c("Sepal.Length", "Sepal.Width")] ) test_fun3 <- function(data) { i <- "Sep" data_select(data, select = starts_with(i)) } expect_identical( test_fun3(iris), iris[, c("Sepal.Length", "Sepal.Width")] ) test_top <- function(x) { testfun1 <- function(i) { is.numeric(i) && mean(i, na.rm = TRUE) > 3.5 } testfun2 <- function(i) { is.numeric(i) && mean(i, na.rm = TRUE) < 5 } data_select(x, select = testfun, exclude = -testfun2) } expect_identical(test_top(iris), iris["Petal.Length"]) }) # preserve attributes -------------------------- test_that("data_select preserves attributes", { skip_if_not_installed("parameters") m <- lm(Sepal.Length ~ Species, data = iris) out <- parameters::parameters(m) a1 <- attributes(out) out2 <- data_select(out, 1:3) a2 <- attributes(out2) expect_identical(sort(names(a1)), sort(names(a2))) }) # Select helpers work in functions and loops test_that("select helpers work in functions and loops", { foo <- function(data, i) { extract_column_names(data, select = starts_with(i)) } expect_identical( foo(iris, "Sep"), c("Sepal.Length", "Sepal.Width") ) for (i in "Sepal") { x <- extract_column_names(iris, select = starts_with(i)) } expect_identical( x, c("Sepal.Length", "Sepal.Width") ) for (i in "Length") { x <- extract_column_names(iris, select = ends_with(i)) } expect_identical( x, c("Sepal.Length", "Petal.Length") ) }) test_that("select helpers work in functions and loops even if there's an object with the same name in the environment above", { i <- "Petal" foo <- function(data, i) { extract_column_names(data, select = starts_with(i)) } expect_identical( foo(iris, "Sep"), c("Sepal.Length", "Sepal.Width") ) for (i in "Sepal") { x <- extract_column_names(iris, select = starts_with(i)) } expect_identical( x, c("Sepal.Length", "Sepal.Width") ) i <- "Width" for (i in "Length") { x <- extract_column_names(iris, select = ends_with(i)) } expect_identical( x, c("Sepal.Length", "Petal.Length") ) }) test_that("old solution still works", { foo <- function(data) { i <- "Sep" extract_column_names(data, select = i, regex = TRUE) } expect_identical( foo(iris), c("Sepal.Length", "Sepal.Width") ) }) test_that("data_select renames variables on the fly", { data(mtcars) expect_named( data_select(mtcars, c(new = "mpg", old = "cyl", hoho = "wt")), c("new", "old", "hoho") ) expect_named( data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt")), c("new", "cyl", "hoho") ) expect_named( data_select(mtcars, c("mpg", "cyl", "wt")), c("mpg", "cyl", "wt") ) # don't fail for non-existing columns expect_named( data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt", test = "grea")), c("new", "cyl", "hoho") ) # check that excluded variables don't cause troubles expect_named( data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt"), exclude = "wt"), c("new", "cyl") ) # error when names are not unique expect_error( data_select(mtcars, c(new = "mpg", old = "cyl", new = "wt")), # nolint regex = "Following names are duplicated" ) expect_error( data_select(mtcars, c(new = "mpg", "cyl", cyl = "wt")), # nolint regex = "Following names are duplicated" ) # when new name is used in exclude, it should be ignored expect_named( data_select(mtcars, c(drat = "mpg"), exclude = "drat"), "drat" ) }) ================================================ FILE: tests/testthat/test-data_separate.R ================================================ test_that("data_separate: simple use case", { # simple case d_sep <- data.frame( x = c("1.a.6", "2.b.7", "3.c.8"), stringsAsFactors = FALSE ) expect_error(data_separate(d_sep), regex = "Either") # basic expect_silent(data_separate(d_sep, guess_columns = "mode", verbose = FALSE)) expect_silent({ out <- data_separate(d_sep, guess_columns = "mode") }) expect_identical(colnames(out), c("x_1", "x_2", "x_3")) expect_identical(out$x_1, c("1", "2", "3")) expect_identical(out$x_2, c("a", "b", "c")) # manual separator char out2 <- data_separate( d_sep, separator = "\\.", guess_columns = "mode", verbose = FALSE ) expect_identical(out, out2) # non-existing separator char expect_message( data_separate(d_sep, separator = "_", guess_columns = "mode"), regex = "Separator probably not found" ) # column names out <- data_separate( d_sep, new_columns = c("A1", "B2", "C3"), verbose = FALSE ) expect_identical(colnames(out), c("A1", "B2", "C3")) expect_identical(out$A1, c("1", "2", "3")) expect_identical(out$B2, c("a", "b", "c")) out <- data_separate(d_sep, new_columns = letters[1:3], append = TRUE) expect_equal( out, data.frame( x = c("1.a.6", "2.b.7", "3.c.8"), a = c("1", "2", "3"), b = c("a", "b", "c"), c = c("6", "7", "8"), stringsAsFactors = FALSE ), ignore_attr = TRUE ) }) test_that("data_separate: convert between data_unite and data_separate", { d_unite <- data.frame( x = as.character(c(NA, 1:3)), y = c(letters[1:3], NA_character_), z = as.character(6:9), m = c("X", NA_character_, "Y", "Z"), n = c("NATION", "COUNTRY", "NATION", NA_character_), stringsAsFactors = FALSE ) out1 <- data_unite(d_unite, new_column = "test") d_sep <- data_separate( out1, new_columns = c("x", "y", "z", "m", "n"), separator = "_" ) expect_identical(d_unite, d_sep) }) test_that("data_separate: different number of values", { d_sep <- data.frame( x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), stringsAsFactors = FALSE ) # basic use-case expect_silent(data_separate(d_sep, guess_columns = "mode", verbose = FALSE)) expect_message( expect_message( expect_message( data_separate(d_sep, guess_columns = "mode"), regex = "3 columns" ), regex = "have been dropped" ), regex = "filled with `NA`" ) out <- data_separate(d_sep, guess_columns = "mode", verbose = FALSE) expect_identical(colnames(out), c("x_1", "x_2", "x_3")) expect_identical(out$x_1, c("1", "2", "3", "5")) expect_identical(out$x_2, c("a", "b", "c", "j")) expect_identical(out$x_3, c("6", "7", "8", NA)) # fill missings left out <- data_separate( d_sep, guess_columns = "mode", fill = "left", verbose = FALSE ) expect_identical(colnames(out), c("x_1", "x_2", "x_3")) expect_identical(out$x_1, c("1", "2", "3", NA)) expect_identical(out$x_2, c("a", "b", "c", "5")) expect_identical(out$x_3, c("6", "7", "8", "j")) # merge extra right out <- data_separate( d_sep, guess_columns = "mode", extra = "merge_right", verbose = FALSE ) expect_identical(colnames(out), c("x_1", "x_2", "x_3")) expect_identical(out$x_1, c("1", "2", "3", "5")) expect_identical(out$x_2, c("a", "b", "c", "j")) expect_identical(out$x_3, c("6", "7 d", "8", NA)) # max columns out <- data_separate(d_sep, guess_columns = "max", verbose = FALSE) expect_equal( out, data.frame( x_1 = c("1", "2", "3", "5"), x_2 = c("a", "b", "c", "j"), x_3 = c("6", "7", "8", NA), x_4 = c(NA, "d", NA, NA), stringsAsFactors = FALSE ), ignore_attr = TRUE ) # min columns out <- data_separate(d_sep, guess_columns = "min", verbose = FALSE) expect_equal( out, data.frame( x_1 = c("1", "2", "3", "5"), x_2 = c("a", "b", "c", "j"), stringsAsFactors = FALSE ), ignore_attr = TRUE ) out <- data_separate( d_sep, guess_columns = "min", extra = "merge_left", verbose = FALSE ) expect_equal( out, data.frame( x_1 = c("1 a", "2 b 7", "3 c", "5"), x_2 = c("6", "d", "8", "j"), stringsAsFactors = FALSE ), ignore_attr = TRUE ) out <- data_separate( d_sep, guess_columns = "max", fill = "left", verbose = FALSE ) expect_equal( out, data.frame( x_1 = c(NA, "2", NA, NA), x_2 = c("1", "b", "3", NA), x_3 = c("a", "7", "c", "5"), x_4 = c("6", "d", "8", "j"), stringsAsFactors = FALSE ), ignore_attr = TRUE ) }) test_that("data_separate: multiple columns", { d_sep <- data.frame( x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), y = c("m.n.99", "77.f.g", "44.9", NA), stringsAsFactors = FALSE ) # select works out <- data_separate( d_sep, select = "x", guess_columns = "mode", verbose = FALSE ) expect_identical(colnames(out), c("y", "x_1", "x_2", "x_3")) expect_identical(out$x_1, c("1", "2", "3", "5")) expect_identical(out$x_2, c("a", "b", "c", "j")) expect_identical(out$x_3, c("6", "7", "8", NA)) out <- data_separate(d_sep, guess_columns = "mode", verbose = FALSE) expect_snapshot(out) out <- data_separate( d_sep, guess_columns = "mode", extra = "merge_right", verbose = FALSE ) expect_snapshot(out) out <- data_separate( d_sep, new_columns = c("A", "B", "C"), extra = "merge_right", verbose = FALSE ) expect_snapshot(out) out <- data_separate( d_sep, new_columns = c("A", "B", "C"), extra = "merge_right", append = TRUE, verbose = FALSE ) expect_snapshot(out) out <- data_separate( d_sep, guess_columns = "mode", extra = "drop_left", verbose = FALSE ) expect_snapshot(out) out <- data_separate( d_sep, new_columns = c("A", "B", "C"), fill = "value_right", extra = "merge_right", append = TRUE, verbose = FALSE ) expect_snapshot(out) out <- data_separate( d_sep, new_columns = c("A", "B", "C"), fill = "value_right", extra = "merge_right", merge_multiple = TRUE, append = TRUE, verbose = FALSE ) expect_snapshot(out) out <- data_separate( d_sep, new_columns = c("A", "B", "C"), merge_multiple = TRUE, append = TRUE, verbose = FALSE ) expect_snapshot(out) out <- data_separate( d_sep, guess_columns = "mode", fill = "value_left", verbose = FALSE ) expect_snapshot(out) }) test_that("data_separate: multiple columns, different lengths", { d_sep <- data.frame( x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), y = c("m.n.99.22", "77.f.g.34", "44.9", NA), stringsAsFactors = FALSE ) # separate column names out <- data_separate( d_sep, select = c("x", "y"), new_columns = list(x = c("A", "B", "C"), y = c("EE", "FF", "GG")), verbose = FALSE ) expect_named(out, c("A", "B", "C", "EE", "FF", "GG")) expect_snapshot(out) out <- data_separate( d_sep, select = c("x", "y"), new_columns = list(x = c("A", "B", "C"), y = c("EE", "FF", "GG", "HH")), verbose = FALSE ) expect_named(out, c("A", "B", "C", "EE", "FF", "GG", "HH")) expect_snapshot(out) }) test_that("data_separate: numeric separator", { d_sep <- data.frame( x = c("Thisisalongstring", "Doeshe1losteverything", "Wereme2longornot"), stringsAsFactors = FALSE ) expect_silent({ out <- data_separate( d_sep, guess_columns = "mode", separator = c(5, 7, 8, 12), verbose = TRUE ) }) expect_equal( out, data.frame( x_1 = c("This", "Does", "Were"), x_2 = c("is", "he", "me"), x_3 = c("a", "1", "2"), x_4 = c("long", "lost", "long"), x_5 = c("string", "everything", "ornot"), stringsAsFactors = FALSE ), ignore_attr = TRUE ) d_sep <- data.frame( x = c("Thisisalongstring", "Doeshe1losteverything"), y = c("Wereme2longornot", NA), stringsAsFactors = FALSE ) expect_silent({ out <- data_separate( d_sep, separator = c(5, 7, 8, 12), new_columns = LETTERS[1:5] ) }) expect_equal( out, data.frame( A = c("This", "Does"), B = c("is", "he"), C = c("a", "1"), D = c("long", "lost"), E = c("string", "everything"), A.1 = c("Were", NA), B.1 = c("me", NA), C.1 = c("2", NA), D.1 = c("long", NA), E.1 = c("ornot", NA), stringsAsFactors = FALSE ), ignore_attr = TRUE ) expect_error( data_separate( d_sep, separator = c(5, 7, 8, 12), new_columns = LETTERS[1:6] ), regex = "went wrong" ) }) test_that("data_separate: fail if invalid column selected", { d_sep <- data.frame( x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), y = c("m.n.99", "77.f.g", "44.9", NA), stringsAsFactors = FALSE ) expect_warning( expect_message( data_separate(d_sep, guess_columns = "mode", select = "z"), reg = "not found" ), regex = "misspelled?" ) expect_identical( data_separate(d_sep, guess_columns = "mode", select = "z", verbose = FALSE), d_sep ) expect_snapshot(data_separate(d_sep, guess_columns = "mode", select = NULL)) }) test_that("data_separate: numeric column", { d_sep <- data.frame( x = c(154353523, 535543532, 12342422, 15454334535), y = c("m.n.99", "77.f.g", "44.9", NA), stringsAsFactors = FALSE ) expect_message( data_separate(d_sep, guess_columns = "mode", select = "x"), regex = "Separator probably" ) out <- data_separate( d_sep, guess_columns = "mode", select = "x", separator = c(3, 6, 9) ) expect_snapshot(out) }) ================================================ FILE: tests/testthat/test-data_shift.R ================================================ # numeric test_that("slide", { x <- c(10, 11, 12) expect_identical(slide(x), c(0, 1, 2)) x <- c(10, 11, 12) expect_identical(slide(x, lowest = 10), x) x <- c(10, 11, 12) expect_identical(slide(x, lowest = 1), c(1, 2, 3)) x <- c(10, 11, NA, 12) expect_identical(slide(x, lowest = 1), c(1, 2, NA, 3)) }) # factor test_that("slide", { data(efc) expect_message(expect_identical(slide(efc$e42dep), efc$e42dep)) }) # data frame test_that("slide", { data(iris) expect_message( out <- slide(iris), # nolint "Shifting non-numeric variables is not possible" ) expect_identical(out$Species, iris$Species) expect_identical(range(out$Sepal.Length), c(0, 3.6), tolerance = 1e-2) }) # select helpers ------------------------------ test_that("slide regex", { expect_identical( slide(mtcars, select = "pg", regex = TRUE), slide(mtcars, select = "mpg") ) }) ================================================ FILE: tests/testthat/test-data_summary.R ================================================ test_that("data_summary, single row summary", { data(iris) out <- data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4) expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4) }) test_that("data_summary, single row summary, string expression", { data(iris) out <- data_summary(iris, "MW = mean(Sepal.Width)", "SD = sd(Sepal.Width)") expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4) expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4) }) test_that("data_summary, summary for groups", { data(iris) out <- data_summary( iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = "Species" ) expect_equal( out$MW, aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width, tolerance = 1e-4 ) expect_equal( out$SD, aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width, tolerance = 1e-4 ) }) test_that("data_summary, summary for groups, string expression", { data(iris) out <- data_summary( iris, "MW = mean(Sepal.Width)", "SD = sd(Sepal.Width)", by = "Species" ) expect_equal( out$MW, aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width, tolerance = 1e-4 ) expect_equal( out$SD, aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width, tolerance = 1e-4 ) }) test_that("data_summary, grouped data frames", { data(iris) d <- data_group(iris, "Species") out <- data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) expect_equal( out$MW, aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width, tolerance = 1e-4 ) expect_equal( out$SD, aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width, tolerance = 1e-4 ) # "by" overrides groups data(mtcars) d <- data_group(mtcars, "gear") out <- data_summary(d, MW = mean(mpg), SD = sd(mpg), by = "am") expect_identical( out$MW, aggregate(mtcars["mpg"], list(mtcars$am), mean)$mpg ) }) test_that("data_summary, summary for multiple groups", { data(mtcars) out <- data_summary( mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear") ) expect_equal( out$MW, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg, tolerance = 1e-4 ) expect_equal( out$SD, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, tolerance = 1e-4 ) x <- data_group(mtcars, c("am", "gear")) out <- data_summary(x, MW = mean(mpg), SD = sd(mpg)) expect_equal( out$MW, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg, tolerance = 1e-4 ) expect_equal( out$SD, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, tolerance = 1e-4 ) }) test_that("data_summary, errors", { data(iris) data(mtcars) # "by" must be character expect_error( data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = 5), regex = "Argument `by` must be a character string" ) # "by" must be in data expect_error( data_summary( iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = "Speceis" ), regex = "Variable \"Speceis\" not" ) # by for multiple variables expect_error( data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "gear")), regex = "Variable \"bam\" not" ) expect_error( data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "geas")), regex = "Did you mean one of \"am\" or \"gear\"?" ) # not a data frame expect_error( data_summary( iris$Sepal.Width, MW = mean(Sepal.Width), SD = sd(Sepal.Width) ), regex = "only works for" ) # no expressions expect_error( data_summary(iris, by = "Species"), regex = "No expressions for calculating" ) # wrong expression expect_error( data_summary(mtcars, mw = mesn(mpg), by = "am"), regex = "There was an error" ) # wrong variable name expect_error( data_summary(mtcars, n = max(mpeg)), regex = "There was an error" ) # expression returns more than one value expect_error( data_summary( mtcars, n = unique(mpg), j = c(min(am), max(am)), by = c("am", "gear") ), regex = "Each expression must return" ) }) test_that("data_summary, values_at", { data(mtcars) out <- data_summary( mtcars, pos1 = mpg[1], pos_end = mpg[length(mpg)], by = c("am", "gear") ) # same as: # dplyr::summarise(mtcars, pos1 = dplyr::first(mpg), pos_end = dplyr::last(mpg), .by = c("am", "gear")) expect_equal(out$pos1, c(21.4, 24.4, 21, 26), tolerance = 1e-3) expect_equal(out$pos_end, c(19.2, 17.8, 21.4, 15), tolerance = 1e-3) }) test_that("data_summary, print", { data(mtcars) out <- data_summary( mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear") ) expect_snapshot(print(out)) }) test_that("data_summary, with NA", { data(efc, package = "datawizard") out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code") expect_snapshot(print(out)) out <- data_summary( efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", remove_na = TRUE ) expect_snapshot(print(out)) # sorting for multiple groups out <- data_summary( efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code") ) expect_snapshot(print(out)) }) test_that("data_summary, inside functions", { foo1 <- function(x, ...) { datawizard::data_summary(x, ..., by = "Species") } foo2 <- function(x, by, ...) { datawizard::data_summary(x, ..., by = by) } foo3 <- function(x, by) { datawizard::data_summary(x, MW = mean(Sepal.Width), by = by) } data(iris) out1 <- foo1(iris, MW = mean(Sepal.Width)) out2 <- foo2(iris, by = "Species", MW = mean(Sepal.Width)) out3 <- foo3(iris, "Species") expect_equal(out1$MW, out2$MW, tolerance = 1e-4) expect_equal(out1$MW, out3$MW, tolerance = 1e-4) }) test_that("data_summary, expression as variable", { data(mtcars) a <- "MW = mean(mpg)" b <- "SD = sd(mpg)" out <- data_summary(mtcars, a, by = c("am", "gear")) expect_named(out, c("am", "gear", "MW")) expect_equal( out$MW, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg, tolerance = 1e-4 ) expect_error( data_summary(mtcars, a, b, by = c("am", "gear")), regex = "You cannot mix" ) out <- data_summary(mtcars, c(a, b), by = c("am", "gear")) expect_named(out, c("am", "gear", "MW", "SD")) expect_equal( out$SD, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, tolerance = 1e-4 ) }) test_that("data_summary, extra functions", { data(mtcars) # n() out <- data_summary(mtcars, n = n(), by = c("am", "gear")) expect_identical(out$n, c(15L, 4L, 8L, 5L)) }) test_that("data_summary, bayestestR::ci", { skip_if_not_installed("bayestestR") data(mtcars) out <- data_summary( mtcars, mean_value = mean(mpg), ci = bayestestR::ci(mpg), by = c("am", "gear") ) expect_named(out, c("am", "gear", "mean_value", "CI", "CI_low", "CI_high")) expect_snapshot(out) out <- data_summary( mtcars, mw = mean(mpg), test = bayestestR::ci(mpg), yolo = c(mean(mpg), sd(mpg)), by = c("am", "gear") ) expect_named( out, c("am", "gear", "mw", "CI", "CI_low", "CI_high", "yolo_1", "yolo_2") ) }) test_that("no warning when variable name and function in global env clash, #583", { dat <- data.frame(rt = 1:10) expect_silent(data_summary(dat, rt = mean(rt))) }) test_that("allow multiple columns for expressions", { set.seed(123) d <- data.frame( x = rnorm(100, 1, 1), y = rnorm(100, 2, 2), groups = rep(1:4, each = 25) ) out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), quant_y = quantile(y, c(0.25, 0.75)), suffix = c("Q1", "Q3") ) expect_equal( out$quant_xQ1, 0.50615, tolerance = 1e-3, ignore_attr = TRUE ) expect_named(out, c("quant_xQ1", "quant_xQ3", "quant_yQ1", "quant_yQ3")) # automatic suffixes out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), quant_y = quantile(y, c(0.1, 0.9)), suffix = NULL ) expect_named(out, c("quant_x25%", "quant_x75%", "quant_y10%", "quant_y90%")) # use own suffix only for one expression - other expressions are # suffixed with `_1`, `_2`, etc. out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), quant_y = quantile(y, c(0.25, 0.5, 0.75)), mean_x = mean(x), suffix = list(quant_y = c("_Q1", "_Q2", "_Q3")), by = "groups" ) expect_named( out, c( "groups", "quant_x25%", "quant_x75%", "quant_y_Q1", "quant_y_Q2", "quant_y_Q3", "mean_x" ) ) set.seed(123) d <- data.frame( x = rnorm(100, 1, 1), y = rnorm(100, 2, 2), w = rnorm(100, 3, 0.5), z = rnorm(100, 4, 3), groups = rep(1:4, each = 25) ) out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)) ) expect_equal( out, data.frame( `quant_x25%` = 0.50615, `quant_x75%` = 1.69182, mean_x = 1.09041, `quant_y25%` = 0.39779, `quant_y50%` = 1.54834, `quant_y75%` = 2.93569 ), tolerance = 1e-3, ignore_attr = TRUE ) out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), fivenum = fivenum(y) ) expect_equal( out, data.frame( `quant_x25%` = 0.50615, `quant_x75%` = 1.69182, mean_x = 1.09041, fivenum_1 = -2.10649, fivenum_2 = 0.36539, fivenum_3 = 1.54834, fivenum_4 = 2.96837, fivenum_5 = 8.48208 ), tolerance = 1e-3, ignore_attr = TRUE ) out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(quant_y = c("_Q1", "_Q2", "_Q3")) ) expect_equal( out, data.frame( `quant_x25%` = 0.50615, `quant_x75%` = 1.69182, mean_x = 1.09041, quant_y_Q1 = 0.39779, quant_y_Q2 = 1.54834, quant_y_Q3 = 2.93569 ), tolerance = 1e-3, ignore_attr = TRUE ) out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q3")) ) expect_equal( out, data.frame( quant_xQ1 = 0.50615, quant_xQ3 = 1.69182, mean_x = 1.09041, quant_y_Q1 = 0.39779, quant_y_Q2 = 1.54834, quant_y_Q3 = 2.93569 ), tolerance = 1e-3, ignore_attr = TRUE ) out <- data_summary( d, quant_x = quantile(x, c(0.25, 0.5)), quant_w = quantile(w, c(0.25, 0.5)), quant_y = quantile(y, c(0.25, 0.5)), quant_z = quantile(z, c(0.25, 0.5)), suffix = c("_Q1", "_Q2") ) expect_equal( out, data.frame( quant_x_Q1 = 0.50615, quant_x_Q2 = 1.06176, quant_w_Q1 = 2.73435, quant_w_Q2 = 3.01796, quant_y_Q1 = 0.39779, quant_y_Q2 = 1.54834, quant_z_Q1 = 1.81187, quant_z_Q2 = 3.98947 ), tolerance = 1e-3, ignore_attr = TRUE ) # errors ------------------------------------------------------------------ expect_error( data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(quant_xy = c("_Q1", "_Q2", "_Q3")) ), regex = "Names of `suffix` must match the names", fixed = TRUE ) expect_error( data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(c("Q1", "Q3"), "mean", c("_Q1", "_Q2", "_Q3")) ), regex = "All elements of `suffix` must have names.", fixed = TRUE ) expect_error( data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = c("_Q1", "_Q2", "_Q3") ), regex = "Argument `suffix` must have the same length", fixed = TRUE ) expect_error( data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(quant_x = c("_Q1", "_Q2", "_Q3")) ), regex = "Argument `suffix` must have the same length", fixed = TRUE ) expect_error( data_summary( d, quant_x = quantile(x, c(0.25, 0.75)), mean_x = mean(x), quant_y = quantile(y, c(0.25, 0.5, 0.75)), suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q2")) ), regex = "All suffixes for a single expression must be unique", fixed = TRUE ) expect_error( data_summary( d, quant_x = quantile(x, c(0.25, 0.5)), quant_w = quantile(w, c(0.25, 0.5)), quant_y = quantile(y, c(0.25, 0.5)), quant_z = quantile(z, c(0.25, 0.5)), suffix = c("_Q1", "_Q2", "_Q3") ), regex = "Argument `suffix` must have the same length", fixed = TRUE ) }) ================================================ FILE: tests/testthat/test-data_tabulate.R ================================================ test_that("data_tabulate factor", { data(efc, package = "datawizard") x <- data_tabulate(efc$e42dep) expect_identical( as.vector(x$Value), as.vector(sort(unique( addNA(efc$e42dep) ))) ) expect_identical(x$N, as.vector(table(addNA(efc$e42dep)))) expect_identical( x$`Valid %`, as.vector(c( 100 * table(efc$e42dep) / sum(!is.na(efc$e42dep)), NA )), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_tabulate numeric", { data(efc, package = "datawizard") x <- data_tabulate(efc$neg_c_7) expect_identical( as.vector(x$Value), as.vector(sort(unique( addNA(efc$neg_c_7) ))) ) expect_identical(x$N, as.vector(table(addNA(efc$neg_c_7)))) expect_identical( x$`Valid %`, as.vector(c( 100 * table(efc$neg_c_7) / sum(!is.na(efc$neg_c_7)), NA )), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_tabulate, HTML", { skip_if_not_installed("gt") data(efc, package = "datawizard") expect_s3_class(print_html(data_tabulate(efc$c172code)), "gt_tbl") expect_s3_class(print_html(data_tabulate(efc, "c172code")), "gt_tbl") expect_s3_class( display(data_tabulate(efc, "c172code"), format = "html"), "gt_tbl" ) }) test_that("data_tabulate, tinytable", { skip_if_not_installed("tinytable") data(efc, package = "datawizard") expect_snapshot(display(data_tabulate(efc$c172code), format = "tt")) expect_snapshot(display(data_tabulate(efc, "c172code"), format = "tt")) }) test_that("data_tabulate, weights", { skip_if_not_installed("knitr") data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) # vector/factor out1 <- data_tabulate(efc$e42dep, weights = efc$weights) out2 <- data_tabulate(efc$e42dep) expect_equal(out1$N, c(3, 4, 26, 67, 5), ignore_attr = TRUE) expect_equal(out2$N, c(2L, 4L, 28L, 63L, 3L), ignore_attr = TRUE) expect_equal( out1$N, round(xtabs(efc$weights ~ efc$e42dep, addNA = TRUE)), ignore_attr = TRUE ) # data frames out <- data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights) expect_equal(out[[1]]$N, out1$N, ignore_attr = TRUE) # mismatch of lengths w <- c(efc$weights, 1) expect_error( data_tabulate(efc$e42dep, weights = w), regex = "Length of `weights`" ) # correct table footer expect_snapshot(print(data_tabulate(efc$e42dep, weights = efc$weights))) expect_snapshot(print_md(data_tabulate(efc$e42dep, weights = efc$weights))) expect_snapshot(display(data_tabulate(efc$e42dep, weights = efc$weights))) # correct table caption expect_snapshot(print(data_tabulate( efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$weights ))) expect_snapshot(print_md(data_tabulate( efc, c("e42dep", "e16sex"), weights = efc$weights ))) expect_snapshot(display(data_tabulate( efc, c("e42dep", "e16sex"), weights = efc$weights ))) }) test_that("data_tabulate data.frame", { data(efc, package = "datawizard") x <- data_tabulate(efc, c("e16sex", "c172code")) expect_s3_class(x, "list") expect_length(x, 2L) expect_identical( attributes(x[[1]]), list( names = c( "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), class = c("datawizard_table", "data.frame"), row.names = 1:3, type = "numeric", varname = "e16sex", label = "elder's gender", object = "e16sex", duplicate_varnames = c(FALSE, TRUE, TRUE), total_n = 100L, valid_n = 100L ) ) expect_identical( attributes(x[[2]]), list( names = c( "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", label = "carer's level of education", object = "c172code", duplicate_varnames = c(FALSE, TRUE, TRUE, TRUE), total_n = 100L, valid_n = 90L ) ) table1 <- x[[1]] expect_identical( as.vector(table1$Value), as.character(c( sort( unique(efc$e16sex) ), NA )) ) expect_identical(table1$N, as.vector(table(addNA(efc$e16sex)))) expect_identical( table1$`Valid %`, as.vector(c( 100 * table(efc$e16sex) / sum(!is.na(efc$e16sex)), NA )), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_tabulate unsupported class", { data(mtcars) expect_warning( data_tabulate(lm(mpg ~ hp, data = mtcars)), regex = "Can't compute frequency tables" ) }) test_that("data_tabulate print", { set.seed(123) x <- sample.int(3, 1e6, TRUE) out <- data_tabulate(x, name = "Large Number") expect_identical( attributes(out), list( names = c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %"), class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "integer", varname = "Large Number", object = "x", duplicate_varnames = c(FALSE, TRUE, TRUE, TRUE), total_n = 1000000L, valid_n = 1000000L ) ) }) test_that("data_tabulate print", { data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc$e42dep)) }) test_that("data_tabulate print multiple", { data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"))) }) test_that("data_tabulate big numbers", { set.seed(123) x <- sample.int(5, size = 1e7, TRUE) expect_snapshot(data_tabulate(x)) expect_snapshot(print(data_tabulate(x), big_mark = "-")) expect_snapshot(print(data_tabulate(x), big_mark = "")) }) test_that("data_tabulate print multiple, collapse", { data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"), collapse = TRUE)) }) test_that("data_tabulate grouped data.frame", { skip_if_not_installed("poorman") data(efc, package = "datawizard") x <- data_tabulate(poorman::group_by(efc, e16sex), "c172code") expect_s3_class(x, "list") expect_length(x, 2L) expect_identical( attributes(x[[1]]), list( names = c( "Variable", "Group", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", label = "carer's level of education", object = "c172code", group_variable = structure( list(e16sex = 1), .drop = TRUE, row.names = 1L, class = "data.frame" ), duplicate_varnames = c(FALSE, TRUE, TRUE, TRUE), total_n = 46L, valid_n = 41L ) ) table1 <- x[[1]] expect_identical( as.vector(table1$Value), as.character(c( sort( unique(efc$c172code) ), NA )) ) expect_identical( table1$N, as.vector(table(addNA(efc$c172code[efc$e16sex == 1]))) ) expect_identical( table1$`Valid %`, as.vector(c( 100 * table(efc$c172code[efc$e16sex == 1]) / sum(!is.na(efc$c172code[efc$e16sex == 1])), NA )), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_tabulate print grouped data", { skip_if_not_installed("poorman") data(efc, package = "datawizard") expect_snapshot(data_tabulate(poorman::group_by(efc, e16sex), "c172code")) }) test_that("data_tabulate print, collapse groups", { skip_if_not_installed("poorman") data(efc, package = "datawizard") expect_snapshot( data_tabulate(poorman::group_by(efc, e16sex), "c172code", collapse = TRUE) ) }) test_that("data_tabulate print, collapse groups, drop levels", { skip_if_not_installed("poorman") data(efc, package = "datawizard") expect_snapshot( data_tabulate( poorman::group_by(efc, e16sex), "e42dep", collapse = TRUE, drop_levels = TRUE ) ) }) test_that("data_tabulate drop levels", { x <- factor(rep(letters[1:3], 3), levels = letters[1:5]) out1 <- data_tabulate(x, drop_levels = FALSE) out2 <- data_tabulate(x, drop_levels = TRUE) expect_identical(out1$N, c(3L, 3L, 3L, 0L, 0L, 0L)) expect_identical(as.character(out1$Value), c("a", "b", "c", "d", "e", NA)) expect_identical(out2$N, c(3L, 3L, 3L, 0L)) expect_identical(as.character(out2$Value), c("a", "b", "c", NA)) }) # select helpers ------------------------------ test_that("data_tabulate regex", { data(mtcars) expect_identical( data_tabulate(mtcars, select = "arb", regex = TRUE), data_tabulate(mtcars, select = "carb") ) }) # missing values ------------------------------ test_that("data_tabulate exclude/include missing values", { data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA out <- data_tabulate(efc$c172code) expect_identical(out$N, c(8L, 66L, 16L, 10L)) out <- data_tabulate(efc$c172code, remove_na = TRUE) expect_identical(out$N, c(8L, 66L, 16L)) out <- data_tabulate(efc$c172code, weights = efc$weights) expect_identical(out$N, c(10, 67, 15, 13)) out <- data_tabulate(efc$c172code, remove_na = TRUE, weights = efc$weights) expect_identical(out$N, c(10, 67, 15)) }) # cross tables ------------------------------ test_that("data_tabulate, cross tables", { data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full" ))) expect_snapshot(print(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE ))) expect_snapshot(print(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights ))) expect_snapshot(print(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights ))) # nolint expect_snapshot(print(data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row" ))) expect_snapshot(print(data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE ))) expect_snapshot(print(data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights ))) expect_snapshot(print(data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights ))) # nolint expect_snapshot(print(data_tabulate( efc, "c172code", by = "e16sex", proportions = "column" ))) expect_snapshot(print(data_tabulate( efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE ))) expect_snapshot(print(data_tabulate( efc, "c172code", by = "e16sex", proportions = "column", weights = "weights" ))) expect_snapshot(print(data_tabulate( efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights" ))) # nolint expect_snapshot(print(data_tabulate( efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row" ))) # nolint }) test_that("data_tabulate, cross tables, HTML", { skip_if_not_installed("gt") data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_s3_class( print_html(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full" )), "gt_tbl" ) expect_s3_class( print_html(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE )), "gt_tbl" ) # nolint expect_s3_class( print_html(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights )), "gt_tbl" ) # nolint expect_s3_class( print_html(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights )), "gt_tbl" ) # nolint expect_s3_class( print_html(data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row" )), "gt_tbl" ) expect_s3_class( print_html(data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights )), "gt_tbl" ) # nolint expect_s3_class( display( data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights ), format = "html" ), "gt_tbl" ) # nolint }) test_that("data_tabulate, cross tables, tinytable", { skip_if_not_installed("tinytable") data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(display( data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"), format = "tt" )) expect_snapshot(display( data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE ), format = "tt" )) expect_snapshot(display( data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights ), format = "tt" )) expect_snapshot(display( data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights ), format = "tt" )) expect_snapshot(display( data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"), format = "tt" )) expect_snapshot(display( data_tabulate( efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights ), format = "tt" )) }) test_that("data_tabulate, cross tables, grouped df", { data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA grp <- data_group(efc, "e42dep") expect_snapshot(print(data_tabulate( grp, "c172code", by = "e16sex", proportions = "row" ))) skip_if_not_installed("gt") expect_s3_class( print_html(data_tabulate( grp, "c172code", by = "e16sex", proportions = "row" )), "gt_tbl" ) # nolint expect_s3_class( print_html(data_tabulate( efc, c("e16sex", "e42dep"), by = "c172code", proportions = "row" )), "gt_tbl" ) # nolint }) test_that("data_tabulate, cross tables, print/format works", { data(mtcars) x <- data_tabulate(mtcars, c("cyl", "am"), by = "gear") expect_snapshot(print(x)) }) test_that("data_tabulate, cross tables, errors by", { data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_error( data_tabulate(efc$c172code, by = "e16sex"), regex = "If `by` is a string" ) expect_error( data_tabulate(efc$c172code, by = efc$e16sex[-1]), regex = "Length of `by`" ) expect_error( data_tabulate(efc, "c172code", by = efc$e16sex[-1]), regex = "Length of `by`" ) expect_error( data_tabulate(efc, "c172code", by = "c16sex"), regex = "not found" ) expect_error( data_tabulate(efc, "c172code", by = c("e16sex", "e42dep")), regex = "You may use" ) }) test_that("data_tabulate, cross tables, errors weights", { data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_error( data_tabulate(efc$c172code, weights = "weights"), regex = "If `weights`" ) expect_error( data_tabulate(efc$c172code, weights = efc$weights[-1]), regex = "Length of `weights`" ) expect_error( data_tabulate(efc, "c172code", weights = efc$weights[-1]), regex = "Length of `weights`" ) expect_error( data_tabulate(efc, "c172code", weights = "weigths"), regex = "not found" ) expect_error( data_tabulate(efc, "c172code", weights = c("e16sex", "e42dep")), regex = "length 1" ) expect_error( data_tabulate(efc$c172code, weights = efc$wweight), regex = "not found" ) }) test_that("data_tabulate, cross tables, modify structure", { skip_if_not_installed("knitr") data(efc, package = "datawizard") x <- data_group(efc, c("c172code", "e16sex")) out <- data_tabulate(x, "c172code") out[] <- lapply( out, data_select, exclude = c("Variable", "Raw %", "Cumulative %") ) junk <- capture.output(print_md(out)) expect_false(grepl("Variable", junk[3], fixed = TRUE)) expect_false(grepl("Raw %", junk[3], fixed = TRUE)) # display() default to markdown junk <- capture.output(display(out)) expect_false(grepl("Variable", junk[3], fixed = TRUE)) expect_false(grepl("Raw %", junk[3], fixed = TRUE)) }) # markdown ------------------------- test_that("data_tabulate, cross tables, markdown", { skip_if_not_installed("knitr") data(efc, package = "datawizard") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print_md(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full" ))) expect_snapshot(print_md(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE ))) expect_snapshot(print_md(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights ))) expect_snapshot(print_md(data_tabulate( efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights ))) # nolint expect_snapshot(print_md(data_tabulate( efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights" ))) # nolint expect_snapshot(print_md(data_tabulate( efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row" ))) # nolint expect_snapshot(display(data_tabulate( efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights" ))) # nolint expect_snapshot(display(data_tabulate( efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row" ))) # nolint }) # validate against table ------------------------- test_that("data_tabulate, validate against table", { data(mtcars) # frequency table out1 <- as.data.frame(table(mtcars$cyl)) out2 <- data_tabulate(mtcars$cyl, remove_na = TRUE) expect_identical(out1$Freq, out2$N) # crosstable out1 <- data_arrange( as.data.frame(table(mtcars$cyl, mtcars$gear)), c("Var1", "Var2") ) out2 <- data_rename( data_to_long( as.data.frame(data_tabulate( mtcars$cyl, by = mtcars$gear, remove_na = TRUE )), 2:4, names_to = "Var2", values_to = "Freq" ), "mtcars$cyl", "Var1" ) out1[[2]] <- as.character(out1[[2]]) expect_equal(out1, out2, ignore_attr = TRUE) }) test_that("data_tabulate, correct 0% for proportions", { data(efc, package = "datawizard") out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") expect_identical( format(out[[1]])[[4]], c("0 (0.0%)", "0 (0.0%)", "0 (0.0%)", "0 (0.0%)", "", "0") ) expect_snapshot(print(out[[1]])) }) # coercing to data frame ------------------------- test_that("data_tabulate, as.data.frame, frequency tables", { data(mtcars) # frequency table x <- data_tabulate(mtcars$cyl) out <- as.data.frame(x) expect_named( out, c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %") ) expect_identical( out$Variable, c("mtcars$cyl", "mtcars$cyl", "mtcars$cyl", "mtcars$cyl") ) expect_false(any(vapply(out[2:ncol(out)], is.character, logical(1)))) # frequency tables x <- data_tabulate(mtcars, select = c("cyl", "am")) out <- as.data.frame(x) expect_named(out, c("var", "table")) expect_equal( vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE ) expect_length(out$table, 2L) expect_named( out$table[[1]], c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %") ) expect_identical(out$table[[1]]$Variable, c("cyl", "cyl", "cyl", "cyl")) expect_false(any(vapply( out$table[[1]][2:ncol(out$table[[1]])], is.character, logical(1) ))) }) test_that("data_tabulate, as.data.frame, cross tables", { data(mtcars) # cross table x <- data_tabulate(mtcars, "cyl", by = "am") out <- as.data.frame(x) expect_named(out, c("var", "table")) expect_equal( vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE ) expect_length(out$table, 1L) expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) expect_identical(nrow(out$table[[1]]), 4L) # cross tables x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") out <- as.data.frame(x) expect_named(out, c("var", "table")) expect_equal( vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE ) expect_length(out$table, 2L) expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) expect_identical(nrow(out$table[[1]]), 4L) }) test_that("data_tabulate, as.data.frame, cross tables with total N", { # cross table, with total x <- data_tabulate(mtcars, "cyl", by = "am") out <- as.data.frame(x, add_total = TRUE) expect_named(out, c("var", "table")) expect_equal( vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE ) expect_length(out$table, 1L) expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) expect_identical(nrow(out$table[[1]]), 5L) expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) # cross tables, with total x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") out <- as.data.frame(x, add_total = TRUE) expect_named(out, c("var", "table")) expect_equal( vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE ) expect_length(out$table, 2L) expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) expect_identical(nrow(out$table[[1]]), 5L) expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) }) # table methods ----------------------------- test_that("data_tabulate, table methods", { data(mtcars) # datawizard_table x <- data_tabulate(mtcars$cyl) expect_type(as.table(x), "list") expect_s3_class(as.table(x, simplify = TRUE), "table") expect_snapshot(as.table(x)) # datawizard_tables x <- data_tabulate(mtcars, "cyl") expect_type(as.table(x), "list") expect_s3_class(as.table(x, simplify = TRUE), "table") expect_snapshot(as.table(x)) # test remove_na x <- data_tabulate(mtcars, "cyl", remove_na = TRUE) expect_identical(x[[1]]$N, as.vector(as.table(x, simplify = TRUE))) x <- data_tabulate(mtcars, "cyl") expect_identical( x[[1]]$N, as.vector(as.table(x, simplify = TRUE, remove_na = FALSE)) ) expect_snapshot(as.table(x, remove_na = FALSE)) # datawizard_tables, multiple x <- data_tabulate(mtcars, c("cyl", "gear")) expect_identical(unlist(lapply(as.table(x), class)), rep("table", 2L)) expect_type(as.table(x, simplify = TRUE), "list") # no simplification expect_type(as.table(x, simplify = FALSE), "list") expect_snapshot(as.table(x)) # datawizard_crosstab x <- data_tabulate(mtcars$cyl, mtcars$gear) expect_type(as.table(x), "list") expect_s3_class(as.table(x, simplify = TRUE), "table") expect_snapshot(as.table(x)) expect_snapshot(as.table(x, simplify = TRUE)) # datawizard_crosstabs x <- data_tabulate(mtcars, "cyl", by = "gear") expect_type(as.table(x), "list") expect_s3_class(as.table(x, simplify = TRUE), "table") expect_snapshot(as.table(x)) expect_snapshot(as.table(x, simplify = TRUE)) # datawizard_crosstabs, multiple x <- data_tabulate(mtcars, c("am", "cyl"), by = "gear") expect_identical(unlist(lapply(as.table(x), class)), rep("table", 2L)) expect_identical( x[[1]]$`3`[1:2], as.vector(as.table(x)[[1]][, 1, drop = TRUE]) ) expect_identical( x[[2]]$`4`[1:3], as.vector(as.table(x)[[2]][, 2, drop = TRUE]) ) expect_type(as.table(x), "list") expect_type(as.table(x, simplify = TRUE), "list") # no simplification expect_snapshot(as.table(x)) # grouped data frames d <- data_group(mtcars, "am") x <- data_tabulate(d, "cyl", by = "gear") expect_named(as.table(x), c("am (0)", "am (1)")) expect_snapshot(as.table(x)) # messages - no missings to remove expect_silent(as.table(data_tabulate(mtcars, "cyl"))) expect_silent(as.table(data_tabulate(mtcars, "cyl"), verbose = FALSE)) }) test_that("data_tabulate, table methods, only warn if necessary", { # missings data(efc) # single variable expect_message(as.table(data_tabulate(efc$c172code))) expect_silent(as.table(data_tabulate(efc$c172code, remove_na = TRUE))) expect_silent(as.table(data_tabulate(efc$c172code), remove_na = FALSE)) expect_silent(as.table(data_tabulate(efc$c172code), verbose = FALSE)) # cross table expect_message( as.table(data_tabulate(efc, "c172code", by = "e42dep")), regex = "Removing NA values" ) expect_silent(as.table(data_tabulate( efc, "c172code", by = "e42dep", remove_na = TRUE ))) expect_silent(as.table( data_tabulate(efc, "c172code", by = "e42dep"), remove_na = FALSE )) expect_silent(as.table( data_tabulate(efc, "c172code", by = "e42dep"), verbose = FALSE )) # no missings data(mtcars) # single variable expect_silent(as.table(data_tabulate(mtcars$gear))) expect_silent(as.table(data_tabulate(mtcars$gear, remove_na = TRUE))) expect_silent(as.table(data_tabulate(mtcars$gear), verbose = FALSE)) # cross table expect_silent(as.table(data_tabulate(mtcars, "gear", by = "cyl"))) expect_silent(as.table(data_tabulate( mtcars, "gear", by = "cyl", remove_na = TRUE ))) expect_silent(as.table( data_tabulate(mtcars, "gear", by = "cyl"), verbose = FALSE )) # group DF throws no warning d <- data_group(mtcars, "am") expect_silent(as.table(data_tabulate(d, "cyl", by = "gear"))) }) test_that("data_tabulate, cross tables, extract proportions", { data(efc, package = "datawizard") out <- data_tabulate( efc, "c172code", by = "e16sex", proportions = "row", remove_na = TRUE ) tab <- table(efc$c172code, efc$e16sex) / rowSums(table(efc$c172code, efc$e16sex)) dimnames(tab) <- list(c("1", "2", "3"), c("male", "female")) expect_equal( as.prop.table(out, verbose = FALSE), list(tab), ignore_attr = TRUE, tolerance = 1e-4 ) expect_equal( as.prop.table(out, verbose = FALSE, simplify = TRUE), tab, ignore_attr = TRUE, tolerance = 1e-4 ) out <- data_tabulate( efc, "c172code", by = "e16sex", proportions = "col", remove_na = TRUE ) tab <- as.table(t( t(table(efc$c172code, efc$e16sex)) / colSums(table(efc$c172code, efc$e16sex)) )) dimnames(tab) <- list(c("1", "2", "3"), c("male", "female")) expect_equal( as.prop.table(out, verbose = FALSE), list(tab), ignore_attr = TRUE, tolerance = 1e-4 ) expect_equal( as.prop.table(out, verbose = FALSE, simplify = TRUE), tab, ignore_attr = TRUE, tolerance = 1e-4 ) }) ================================================ FILE: tests/testthat/test-data_to_factor.R ================================================ # numeric test_that("to_factor", { x <- c(10, 11, 12) expect_identical( to_factor(x), structure(1:3, .Label = c("10", "11", "12"), class = "factor") ) data(efc) x <- to_factor(efc$c172code) expect_identical( levels(x), c( "low level of education", "intermediate level of education", "high level of education" ) ) x <- to_factor(efc$c172code, labels_to_levels = FALSE) expect_identical(levels(x), c("1", "2", "3")) }) # numeric, partially labelled test_that("to_factor", { x <- c(10, 11, 12) attr(x, "labels") <- c(ten = 10, twelve = 12) expect_message( expect_identical( to_factor(x), structure(1:3, levels = c("ten", "11", "twelve"), class = "factor") ), regexp = "Not all factor levels" ) }) # factor test_that("to_factor", { data(efc) expect_identical(to_factor(efc$e42dep), efc$e42dep) }) # data frame test_that("to_factor", { data(iris) out <- to_factor(iris) expect_identical(out$Species, iris$Species) expect_true(all(vapply(out, is.factor, TRUE))) expect_identical( levels(out$Sepal.Length), c( "4.3", "4.4", "4.5", "4.6", "4.7", "4.8", "4.9", "5", "5.1", "5.2", "5.3", "5.4", "5.5", "5.6", "5.7", "5.8", "5.9", "6", "6.1", "6.2", "6.3", "6.4", "6.5", "6.6", "6.7", "6.8", "6.9", "7", "7.1", "7.2", "7.3", "7.4", "7.6", "7.7", "7.9" ) ) out <- to_factor(iris, select = starts_with("Sep"), append = TRUE) expect_identical( colnames(out), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Length_f", "Sepal.Width_f" ) ) expect_identical(sum(vapply(out, is.factor, TRUE)), 3L) }) # select helpers ------------------------------ test_that("to_factor regex", { expect_identical( to_factor(mtcars, select = "yl", regex = TRUE), to_factor(mtcars, select = "cyl") ) expect_identical( to_factor(mtcars, select = "yl$", regex = TRUE), to_factor(mtcars, select = "cyl") ) }) # SPSS file, many value labels ----------------------------------- skip_if_not_installed("httr") skip_if_not_installed("haven") skip_on_cran() skip_if_not_installed("curl") skip_if_offline() # Output validated against SPSS output from original dataset test_that("data_read, convert many labels correctly", { temp_file <- tempfile(fileext = ".sav") request <- httr::GET( "https://raw.github.com/easystats/circus/master/data/spss_many_labels.sav" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- data_read( temp_file, convert_factors = FALSE, verbose = FALSE ) expect_identical( levels(to_factor(d$selv1)), c( "Vignette 1 weiblich (Gülsen E. Reinigungskraft B)", "Vignette 2 weiblich (Gülsen E. Anwältin B)", "Vignette 3 weiblich (Monika E. Reinigungskraft B)", "Vignette 4 weiblich (Monika E. Anwältin B)", "Vignette 5 männlich (Hasan E. Reinigungskraft B)", "Vignette 6 männlich (Hasan E. Anwalt B)", "Vignette 7 männlich (Martin E. Reinigungskraft B)", "Vignette 8 männlich (Martin E. Anwalt B)", "Vignette 9 weiblich (Gülsen E. Reinigungskraft E)", "Vignette 10 weiblich (Gülsen E. Anwältin E)", "Vignette 11 weiblich (Monika E. Reinigungskraft E)", "Vignette 12 weiblich (Monika E. Anwältin E)", "Vignette 13 männlich (Hasan E. Reinigungskraft E)", "Vignette 14 männlich (Hasan E. Anwalt E)", "Vignette 15 männlich (Martin E. Reinigungskraft E)", "Vignette 16 männlich (Martin E. Anwalt E)" ) ) expect_snapshot(data_tabulate(to_factor(d$selv1))) expect_identical(levels(to_factor(d$c12)), c("ja", "nein", "keine Angabe")) expect_snapshot(data_tabulate(to_factor(d$c12))) expect_identical( levels(to_factor(d$c12a)), c("Filter", "ja", "nein", "keine Angabe") ) expect_snapshot(data_tabulate(to_factor(d$c12a))) expect_identical( levels(to_factor(d$c12c)), c( "Filter", "0 = keine", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10 = sehr starke", "weiß nicht / keine Angabe" ) ) expect_snapshot(data_tabulate(to_factor(d$c12c))) unlink(temp_file) }) test_that("to_factor works with haven_labelled, convert many labels correctly", { skip_if_not_installed("withr") withr::with_tempfile("temp_file", fileext = ".sav", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/EFC.sav" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- haven::read_spss(temp_file) x <- to_factor(d$c172code) expect_identical( levels(x), c( "low level of education", "intermediate level of education", "high level of education" ) ) }) }) ================================================ FILE: tests/testthat/test-data_to_long.R ================================================ set.seed(123) wide_data <- data.frame(replicate(3, sample.int(5))) test_that("data_to_long works", { expect_equal( head(data_to_long(wide_data)), data.frame( Name = c("X1", "X2", "X3", "X1", "X2", "X3"), Value = c(3L, 3L, 2L, 2L, 1L, 3L), stringsAsFactors = FALSE ), ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( head(data_to_long( wide_data, select = c(1, 2), names_to = "Column", values_to = "Numbers", rows_to = "Row" )), data.frame( X3 = c(2L, 2L, 3L, 3L, 1L, 1L), Row = c(1, 1, 2, 2, 3, 3), Column = c("X1", "X2", "X1", "X2", "X1", "X2"), Numbers = c(3L, 3L, 2L, 1L, 5L, 2L), stringsAsFactors = FALSE ), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_to_long works - using row names as idvar", { data(mtcars) out <- data_to_long(mtcars, select = 2:4) expect_equal( dim(out), c(96, 10), ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( colnames(out), c("mpg", "drat", "wt", "qsec", "vs", "am", "gear", "carb", "name", "value"), ignore_attr = TRUE, tolerance = 1e-3 ) expect_equal( head(out$value), c(8, 304, 150, 8, 472, 205), ignore_attr = TRUE, tolerance = 1e-3 ) }) test_that("data_to_long works - complex dataset", { skip_if_not_installed("psych") data <- psych::bfi long <- data_to_long( data, select = regex("\\d"), names_to = "Item", values_to = "Score", rows_to = "Participant" ) expect_snapshot(str(long)) long$Facet <- gsub("\\d", "", long$Item) long$Item <- gsub("[A-Z]", "", long$Item) long$Item <- paste0("I", long$Item) long1 <- data_to_long( data, select = starts_with("A"), names_to = "Item", values_to = "Score", rows_to = "Participant" ) expect_identical(unique(long1$Item), c("A1", "A2", "A3", "A4", "A5")) expect_identical(unique(long1$Score), c(2L, 4L, 3L, 5L, 6L, 1L, NA)) expect_identical(ncol(long1), 26L) expect_identical(nrow(long1), 14000L) long1 <- data_to_long( data, select = starts_with("a"), names_to = "Item", values_to = "Score", rows_to = "Participant" ) expect_identical(ncol(long1), 30L) expect_identical(nrow(long1), nrow(data)) long1 <- data_to_long( data, select = starts_with("a"), names_to = "Item", values_to = "Score", rows_to = "Participant", ignore_case = TRUE ) expect_identical(unique(long1$Item), c("A1", "A2", "A3", "A4", "A5", "age")) expect_identical(ncol(long1), 25L) expect_identical(nrow(long1), 16800L) long1 <- data_to_long( data, select = c(1:5, 28), names_to = "Item", values_to = "Score", rows_to = "Participant", ignore_case = TRUE ) expect_identical(unique(long1$Item), c("A1", "A2", "A3", "A4", "A5", "age")) expect_identical(ncol(long1), 25L) expect_identical(nrow(long1), 16800L) }) test_that("data_to_long: arg 'cols' overrides 'select'", { skip_if_not_installed("psych") data <- psych::bfi expect_identical( data_to_long( wide_data, select = c(1, 2), names_to = "Column", values_to = "Numbers", rows_to = "Row" ), data_to_long( wide_data, cols = c(1, 2), names_to = "Column", values_to = "Numbers", rows_to = "Row" ) ) expect_identical( data_to_long( data, cols = regex("\\d"), names_to = "Item", values_to = "Score", rows_to = "Participant" ), data_to_long( data, select = regex("\\d"), names_to = "Item", values_to = "Score", rows_to = "Participant" ) ) expect_identical( data_to_long( data, cols = starts_with("A"), names_to = "Item", values_to = "Score", rows_to = "Participant" ), data_to_long( data, select = starts_with("A"), names_to = "Item", values_to = "Score", rows_to = "Participant" ) ) }) d <- data.frame( age = c(20, 30, 40), sex = c("Female", "Male", "Male"), score_t1 = c(30, 35, 32), score_t2 = c(33, 34, 37), speed_t1 = c(2, 3, 1), speed_t2 = c(3, 4, 5), stringsAsFactors = FALSE ) test_that("data_to_long works as expected - simple dataset", { out <- data_to_long(d, starts_with("score")) expect_identical( out$name, c("score_t1", "score_t2", "score_t1", "score_t2", "score_t1", "score_t2") ) expect_identical( out$value, c(d$score_t1, d$score_t2)[c(1, 4, 2, 5, 3, 6)] ) out <- data_to_long( d, contains("t2"), names_to = "NewCol", values_to = "Time" ) expect_identical( out$NewCol, c("score_t2", "speed_t2", "score_t2", "speed_t2", "score_t2", "speed_t2") ) expect_identical(out$Time, c(33, 3, 34, 4, 37, 5)) }) test_that("data_to_long works as expected - select-helper inside functions, using regex", { test_fun <- function(data, i) { data_to_long(data, select = i, regex = TRUE) } out <- test_fun(d, "^score") expect_identical( out$name, c("score_t1", "score_t2", "score_t1", "score_t2", "score_t1", "score_t2") ) expect_identical( out$value, c(d$score_t1, d$score_t2)[c(1, 4, 2, 5, 3, 6)] ) }) test_that("data_to_long: need to provide sep or pattern if several names_to", { expect_error( data_to_long(wide_data, names_to = c("foo", "foo2")), "you supply multiple names" ) }) test_that("data_to_long: can't use sep or pattern if only one names_to", { expect_error( data_to_long(wide_data, names_to = "foo", names_sep = "_"), "can't use `names_sep`" ) expect_error( data_to_long(wide_data, names_to = "foo", names_pattern = "_"), "can't use `names_pattern`" ) }) test_that("data_to_long: error if no columns to reshape", { # since #602, we no longer have the case that .select_nse() returns no # columns, because we error before when no column found, instead of returning # NULL or a vector of lenght zero. expect_error( data_to_long(wide_data, cols = "foo"), "Possibly misspelled" ) }) # EQUIVALENCE WITH TIDYR - PIVOT_LONGER ------------------------------------------- # Examples coming from: https://tidyr.tidyverse.org/articles/pivot.html#longer test_that("data_to_long equivalent to pivot_longer: ex 1", { skip_if_not_installed("tidyr") x <- tidyr::relig_income %>% # nolint tidyr::pivot_longer(!religion, names_to = "income", values_to = "count") y <- tidyr::relig_income %>% # nolint data_to_long(cols = -religion, names_to = "income", values_to = "count") expect_equal(x, y, ignore_attr = TRUE) }) test_that("data_to_long equivalent to pivot_longer: ex 2", { skip_if_not_installed("tidyr") x <- tidyr::billboard %>% tidyr::pivot_longer( cols = starts_with("wk"), names_to = "week", values_to = "rank" ) y <- tidyr::billboard %>% data_to_long( cols = starts_with("wk"), names_to = "week", values_to = "rank" ) expect_equal(x, y, ignore_attr = TRUE) }) test_that("data_to_long equivalent to pivot_longer: ex 3", { skip_if_not_installed("tidyr") x <- tidyr::billboard %>% tidyr::pivot_longer( cols = starts_with("wk"), names_to = "week", values_to = "rank", values_drop_na = TRUE ) y <- tidyr::billboard %>% data_to_long( cols = starts_with("wk"), names_to = "week", values_to = "rank", values_drop_na = TRUE ) expect_equal(x, y, ignore_attr = TRUE) }) test_that("data_to_long equivalent to pivot_longer: ex 4", { skip_if_not_installed("tidyr") x <- tidyr::billboard %>% tidyr::pivot_longer( cols = starts_with("wk"), names_to = "week", names_prefix = "wk", values_to = "rank", values_drop_na = TRUE ) y <- tidyr::billboard %>% data_to_long( select = starts_with("wk"), names_to = "week", names_prefix = "wk", values_to = "rank", values_drop_na = TRUE ) expect_equal(x, y, ignore_attr = TRUE) }) test_that("data_to_long equivalent to pivot_longer: ex 5", { skip_if_not_installed("tidyr") suppressWarnings({ x <- tidyr::who %>% tidyr::pivot_longer( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_sep = "_", values_to = "count" ) }) y <- tidyr::who %>% data_to_long( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_sep = "_", values_to = "count" ) expect_equal(x, y, ignore_attr = TRUE) }) test_that("data_to_long equivalent to pivot_longer: ex 6", { skip_if_not_installed("tidyr") x <- tidyr::who %>% tidyr::pivot_longer( cols = new_sp_m014:newrel_f65, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ) y <- tidyr::who %>% data_to_long( cols = 5:60, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ) expect_equal(x, y, ignore_attr = TRUE) }) # tests coming from tidyr's repo # https://github.com/tidyverse/tidyr/blob/main/tests/testthat/test-pivot-long.R test_that("can reshape all cols to long", { skip_if_not_installed("tidyr") df <- tidyr::tibble(x = 1:2, y = 3:4) pv <- data_to_long(df, x:y) expect_named(pv, c("name", "value")) expect_identical(pv$name, rep(names(df), 2)) expect_identical(pv$value, c(1L, 3L, 2L, 4L)) }) test_that("values interleaved correctly", { skip_if_not_installed("tidyr") df <- tidyr::tibble( x = c(1, 2), y = c(10, 20), z = c(100, 200) ) pv <- data_to_long(df, 1:3) expect_identical(pv$value, c(1, 10, 100, 2, 20, 200)) }) test_that("preserves original keys", { skip_if_not_installed("tidyr") df <- tidyr::tibble(x = 1:2, y = 2, z = 1:2) pv <- data_to_long(df, y:z) expect_named(pv, c("x", "name", "value")) expect_identical(pv$x, rep(df$x, each = 2)) }) test_that("can drop missing values", { skip_if_not_installed("tidyr") df <- data.frame(x = c(1, NA), y = c(NA, 2)) pv <- data_to_long(df, x:y, values_drop_na = TRUE) expect_identical(pv$name, c("x", "y")) expect_identical(pv$value, c(1, 2)) }) test_that("mixed columns are automatically coerced", { skip_if_not_installed("tidyr") df <- data.frame(x = factor("a"), y = factor("b")) pv <- data_to_long(df, x:y) expect_identical(pv$value, factor(c("a", "b"))) }) test_that("error when overwriting existing column", { skip_if_not_installed("tidyr") df <- tidyr::tibble(x = 1, y = 2) expect_error( data_to_long(df, y, names_to = "x"), regexp = "are already present" ) }) test_that("preserve date format", { skip_if_not_installed("tidyr") family <- tidyr::tibble( family = 1:3, dob_child1 = as.Date(c("1998-11-26", "2004-10-10", "2000-12-05")), dob_child2 = as.Date(c("2000-01-29", NA, "2004-04-05")) ) tidyr <- tidyr::pivot_longer(family, !family, names_to = "child") datawiz <- data_to_long(family, -family, names_to = "child") expect_identical(tidyr, datawiz) }) test_that("works with labelled data", { data(efc, package = "datawizard") out <- data_to_long( efc, select = c("e16sex", "c172code"), names_to = "Dummy", values_to = "Score" ) expect_identical(nrow(out), 200L) expect_identical(attributes(out$e42dep)$label, "elder's dependency") }) test_that("don't convert factors to integer", { data("mtcars") mtcars <- mtcars[c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 18L, 29L, 31L), ] mtcars$am_f <- factor(mtcars$am) mtcars$cyl_f <- factor(mtcars$cyl) mtcars$id <- factor(seq_len(nrow(mtcars))) mtcars_long <- data_to_long( mtcars, select = c("mpg", "qsec", "disp"), names_to = "g" ) expect_snapshot(print(mtcars_long)) }) test_that("tell user about typos", { data("mtcars") expect_silent(data_to_long( mtcars, select = c("mpg", "hp", "disp"), names_to = "time", values_to = "count" )) expect_error( data_to_long( mtcars, select = c("mpg", "ho", "dist"), names_to = "time", values_to = "count" ), regex = "Following" ) }) ================================================ FILE: tests/testthat/test-data_to_numeric.R ================================================ test_that("convert data frame to numeric", { expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = TRUE)) expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = FALSE)) }) test_that("convert character to numeric", { expect_identical(to_numeric(c("xyz", "ab")), c(2, 1)) }) test_that("convert character to numeric Date", { expect_warning(expect_identical( to_numeric(as.Date("2022-01-01")), as.numeric(as.Date("2022-01-01")) )) expect_warning(expect_identical( to_numeric(as.POSIXct("2022-01-01")), as.numeric(as.POSIXct("2022-01-01")) )) expect_warning(expect_identical( to_numeric(as.POSIXlt("2022-01-01")), as.numeric(as.POSIXlt("2022-01-01")) )) }) test_that("convert character to numeric preserve levels", { x <- head(as.factor(mtcars$gear)) expect_identical( to_numeric(x, dummy_factors = FALSE), c(2, 2, 2, 1, 1, 1) ) expect_identical( to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE), c(4, 4, 4, 3, 3, 3) ) }) test_that("convert character to numeric lowest", { d <- head(mtcars) d$vs <- as.factor(d$vs) model <- glm(vs ~ wt + mpg, data = d, family = "binomial") expect_identical( to_numeric(insight::get_response(model), dummy_factors = FALSE), c(1, 1, 2, 2, 1, 2) ) expect_identical( to_numeric(insight::get_response(model), dummy_factors = FALSE, lowest = 0), c(0, 0, 1, 1, 0, 1) ) }) test_that("convert factor to numeric", { f <- factor(substring("statistics", 1:10, 1:10)) expect_snapshot(to_numeric(f, dummy_factors = TRUE)) }) test_that("convert factor to numeric", { expect_identical(to_numeric(c("abc", "xyz")), c(1, 2)) expect_identical(to_numeric(c("123", "789")), c(123, 789)) expect_identical(to_numeric(c("1L", "2e-3")), c(1, 0.002)) expect_identical(to_numeric(c("1L", "2e-3", "ABC")), c(1, 2, 3)) }) test_that("convert factor to numeric, dummy factors", { expect_identical( to_numeric(c("abc", "xyz"), dummy_factors = TRUE), data.frame(abc = c(1, 0), xyz = c(0, 1)), ignore_attr = TRUE ) expect_identical( to_numeric(c("1L", "2e-3", "ABC"), dummy_factors = TRUE), data.frame(`1L` = c(1, 0, 0), `2e-3` = c(0, 1, 0), ABC = c(0, 0, 1)), ignore_attr = TRUE ) }) test_that("convert factor to numeric, append", { data(efc) expect_identical( colnames(to_numeric(efc, dummy_factors = TRUE)), c( "c12hour", "e16sex", "e42dep.1", "e42dep.2", "e42dep.3", "e42dep.4", "c172code", "neg_c_7" ), ignore_attr = TRUE ) expect_identical( colnames(to_numeric(efc, dummy_factors = TRUE, append = TRUE)), c( "c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_n", "e42dep_n.1", "e42dep_n.2", "e42dep_n.3", "e42dep_n.4" ), ignore_attr = TRUE ) expect_identical( colnames(to_numeric(efc, append = TRUE, dummy_factors = FALSE)), c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_n"), ignore_attr = TRUE ) expect_identical( colnames(to_numeric(efc, append = FALSE, dummy_factors = FALSE)), c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7"), ignore_attr = TRUE ) }) test_that("convert factor to numeric, all numeric", { data(mtcars) expect_identical(to_numeric(mtcars), mtcars) }) test_that("convert factor to numeric, dummy factors, with NA", { x1 <- factor(rep(c("a", "b"), 3)) x2 <- factor(c("a", NA_character_, "a", "b", "a", "b")) x3 <- factor(c(NA_character_, "b", "a", "b", "a", "b")) x4 <- factor(c("a", "b", "a", "b", "a", NA_character_)) x5 <- factor(c(NA_character_, "b", "a", "b", "a", NA_character_)) x6 <- factor(c(NA_character_, "b", NA_character_, "b", "a", NA_character_)) x7 <- factor(c( NA_character_, "b", "a", "b", "a", "b", NA_character_, "b", "a", NA_character_, "a", "b", "a", "b", "a", NA_character_ )) # same observations are missing expect_identical( which(!complete.cases(to_numeric(x1, dummy_factors = TRUE))), which(is.na(x1)) ) expect_identical( which(!complete.cases(to_numeric(x2, dummy_factors = TRUE))), which(is.na(x2)) ) expect_identical( which(!complete.cases(to_numeric(x3, dummy_factors = TRUE))), which(is.na(x3)) ) expect_identical( which(!complete.cases(to_numeric(x4, dummy_factors = TRUE))), which(is.na(x4)) ) expect_identical( which(!complete.cases(to_numeric(x5, dummy_factors = TRUE))), which(is.na(x5)) ) expect_identical( which(!complete.cases(to_numeric(x6, dummy_factors = TRUE))), which(is.na(x6)) ) expect_identical( which(!complete.cases(to_numeric(x7, dummy_factors = TRUE))), which(is.na(x7)) ) # output has same number of observation as input expect_identical(nrow(to_numeric(x1, dummy_factors = TRUE)), length(x1)) expect_identical(nrow(to_numeric(x2, dummy_factors = TRUE)), length(x2)) expect_identical(nrow(to_numeric(x3, dummy_factors = TRUE)), length(x3)) expect_identical(nrow(to_numeric(x4, dummy_factors = TRUE)), length(x4)) expect_identical(nrow(to_numeric(x5, dummy_factors = TRUE)), length(x5)) expect_identical(nrow(to_numeric(x6, dummy_factors = TRUE)), length(x6)) expect_identical(nrow(to_numeric(x7, dummy_factors = TRUE)), length(x7)) }) test_that("to_numeric, inverse factor levels", { f <- c(0, 0, 1, 1, 1, 0) x1 <- factor(f, levels = c(0, 1)) x2 <- factor(f, levels = c(1, 0)) out <- to_numeric(x1, dummy_factors = FALSE, preserve_levels = FALSE) expect_identical(out, c(1, 1, 2, 2, 2, 1)) out <- to_numeric(x2, dummy_factors = FALSE, preserve_levels = FALSE) expect_identical(out, c(2, 2, 1, 1, 1, 2)) out <- to_numeric(x1, dummy_factors = FALSE, preserve_levels = TRUE) expect_identical(out, c(0, 0, 1, 1, 1, 0)) out <- to_numeric(x2, dummy_factors = FALSE, preserve_levels = TRUE) expect_identical(out, c(1, 1, 0, 0, 0, 1)) }) # select helpers ------------------------------ test_that("to_numeric regex", { expect_identical( to_numeric(mtcars, select = "pg", regex = TRUE), to_numeric(mtcars, select = "mpg") ) }) test_that("to_numeric works with haven_labelled, convert many labels correctly", { skip_on_cran() skip_if_not_installed("httr") skip_if_not_installed("haven") skip_if_not_installed("withr") skip_if_not_installed("curl") skip_if_offline() withr::with_tempfile("temp_file", fileext = ".sav", code = { request <- httr::GET( "https://raw.github.com/easystats/circus/main/data/EFC.sav" ) httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) d <- haven::read_spss(temp_file) x <- to_numeric(d$c172code) expect_identical(as.vector(table(x)), c(180L, 506L, 156L)) }) }) test_that("to_numeric preserves correct label order", { x <- factor(c(1, 2, 3, 4)) x <- assign_labels(x, values = c("one", "two", "three", "four")) out <- to_numeric(x, dummy_factors = FALSE) expect_identical( attributes(out)$labels, c(one = 1, two = 2, three = 3, four = 4) ) # correctly reverse scale out <- to_numeric(reverse_scale(x), dummy_factors = FALSE) expect_identical( attributes(out)$labels, c(one = 4, two = 3, three = 2, four = 1) ) # factor with alphabetical values x <- factor(letters[1:4]) x <- assign_labels(x, values = c("one", "two", "three", "four")) out <- to_numeric(x, dummy_factors = FALSE) expect_identical( attributes(out)$labels, c(one = 1, two = 2, three = 3, four = 4) ) # correctly reverse scale out <- to_numeric(reverse_scale(x), dummy_factors = FALSE) expect_identical( attributes(out)$labels, c(one = 4, two = 3, three = 2, four = 1) ) }) ================================================ FILE: tests/testthat/test-data_to_wide.R ================================================ test_that("data_to_wide works", { long_data <- data.frame( Row_ID = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5), name = c( "X1", "X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2", "X2", "X3", "X3", "X3", "X3", "X3" ), value = c(3L, 2L, 5L, 4L, 1L, 3L, 1L, 2L, 5L, 4L, 2L, 3L, 1L, 4L, 5L), stringsAsFactors = FALSE ) expect_equal( data_to_wide( long_data, names_from = "name", values_from = "value", id_cols = "Row_ID" ), data.frame( Row_ID = c(1, 2, 3, 4, 5), X1 = c(3L, 2L, 5L, 4L, 1L), X2 = c(3L, 1L, 2L, 5L, 4L), X3 = c(2L, 3L, 1L, 4L, 5L), stringsAsFactors = FALSE ), ignore_attr = TRUE, tolerance = 1e-3 ) long_data$X1 <- 5 expect_error( data_to_wide( long_data, names_from = "name", values_from = "value", id_cols = "Row_ID" ), regexp = "Some values of the columns specified in `names_from`" ) }) test_that("data_to_wide, names_prefix works", { skip_if_not_installed("tidyr") out <- data_to_wide( tidyr::fish_encounters, names_from = "station", values_from = "seen", names_prefix = "foo_" ) expect_named( out, c( "fish", "foo_Release", "foo_I80_1", "foo_Lisbon", "foo_Rstr", "foo_Base_TD", "foo_BCE", "foo_BCW", "foo_BCE2", "foo_BCW2", "foo_MAE", "foo_MAW" ) ) }) test_that("data_to_wide, values_fill deprecated", { skip_if_not_installed("tidyr") expect_warning( data_to_wide( tidyr::fish_encounters, names_from = "station", values_from = "seen", values_fill = c(1, 2) ), regexp = "`values_fill` is defunct", fixed = TRUE ) }) # EQUIVALENCE WITH TIDYR - PIVOT_WIDER ----------------------------------------------- # Examples coming from: https://tidyr.tidyverse.org/articles/pivot.html#wider # and from https://github.com/tidyverse/tidyr/blob/main/tests/testthat/test-pivot-wide.R ### From tidyr tests test_that("can pivot all cols to wide", { skip_if_not_installed("tidyr") df <- tidyr::tibble(key = c("x", "y", "z"), val = 1:3) pv <- data_to_wide(df, names_from = "key", values_from = "val") expect_named(pv, c("x", "y", "z")) expect_identical(nrow(pv), 1L) }) test_that("non-pivoted cols are preserved", { skip_if_not_installed("tidyr") df <- tidyr::tibble(a = 1, key = c("x", "y"), val = 1:2) pv <- data_to_wide(df, names_from = "key", values_from = "val") expect_named(pv, c("a", "x", "y")) expect_identical(nrow(pv), 1L) }) test_that("implicit missings turn into explicit missings", { skip_if_not_installed("tidyr") df <- tidyr::tibble(a = 1:2, key = c("x", "y"), val = 1:2) pv <- data_to_wide(df, names_from = "key", values_from = "val") expect_identical(pv$a, c(1L, 2L)) expect_identical(pv$x, c(1L, NA)) expect_identical(pv$y, c(NA, 2L)) }) test_that("error when overwriting existing column", { skip_if_not_installed("tidyr") df <- tidyr::tibble( a = c(1, 1), key = c("a", "b"), val = c(1, 2) ) expect_error( data_to_wide(df, names_from = "key", values_from = "val"), regexp = "Some values of the columns specified" ) }) test_that("data_to_wide: fill values, #293", { skip_if_not_installed("tidyr") weekdays <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") daily <- tidyr::tibble( day = factor(c("Tue", "Thu", "Fri", "Mon"), levels = weekdays), value = c(2, 3, 1, 5), type = factor(c("A", "B", "B", "A")) ) expect_identical( tidyr::pivot_wider( daily, names_from = type, values_from = value ), data_to_wide( daily, names_from = "type", values_from = "value" ) ) }) test_that("data_to_wide, id_cols works correctly, #293", { skip_if_not_installed("tidyr") updates <- tidyr::tibble( county = c("Wake", "Wake", "Wake", "Guilford", "Guilford"), date = c(as.Date("2020-01-01") + 0:2, as.Date("2020-01-03") + 0:1), system = c("A", "B", "C", "A", "C"), value = c(3.2, 4, 5.5, 2, 1.2) ) expect_identical( tidyr::pivot_wider( updates, id_cols = county, names_from = system, values_from = value ), data_to_wide( updates, id_cols = "county", names_from = "system", values_from = "value" ) ) }) ### Examples from tidyr website test_that("data_to_wide equivalent to pivot_wider: ex 1", { skip_if_not_installed("tidyr") x <- tidyr::pivot_wider( tidyr::fish_encounters, names_from = "station", values_from = "seen" ) y <- data_to_wide( tidyr::fish_encounters, names_from = "station", values_from = "seen" ) expect_equal(x, y, ignore_attr = TRUE) }) test_that("data_to_wide equivalent to pivot_wider: ex 2", { skip_if_not_installed("tidyr") production <- tidyr::expand_grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) %>% data_filter((product == "A" & country == "AI") | product == "B") production$production <- rnorm(nrow(production)) x <- production %>% tidyr::pivot_wider( names_from = c(product, country), values_from = production ) y <- production %>% data_to_wide( names_from = c("product", "country"), values_from = "production" ) expect_identical(x, y) }) test_that("data_to_wide equivalent to pivot_wider: ex 3", { skip_if_not_installed("tidyr") x <- tidyr::us_rent_income %>% tidyr::pivot_wider( names_from = variable, values_from = c(estimate, moe) ) y <- tidyr::us_rent_income %>% data_to_wide( names_from = "variable", values_from = c("estimate", "moe") ) expect_identical(x, y) }) test_that("data_to_wide equivalent to pivot_wider: ex 4", { skip_if_not_installed("tidyr") x <- tidyr::us_rent_income %>% tidyr::pivot_wider( names_from = variable, names_sep = ".", values_from = c(estimate, moe) ) y <- tidyr::us_rent_income %>% data_to_wide( names_from = "variable", names_sep = ".", values_from = c("estimate", "moe") ) expect_identical(x, y) }) test_that("data_to_wide equivalent to pivot_wider: ex 5", { skip_if_not_installed("tidyr") contacts <- tidyr::tribble( ~field, ~value, "name", "Jiena McLellan", "company", "Toyota", "name", "John Smith", "company", "google", "email", "john@google.com", "name", "Huxley Ratcliffe" ) contacts$person_id <- cumsum(contacts$field == "name") x <- tidyr::pivot_wider(contacts, names_from = field, values_from = value) y <- data_to_wide(contacts, names_from = "field", values_from = "value") expect_identical(x, y) }) test_that("data_to_wide equivalent to pivot_wider: ex 6", { skip_if_not_installed("tidyr") production <- tidyr::expand_grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) %>% data_filter((product == "A" & country == "AI") | product == "B") production$production <- rnorm(nrow(production)) x <- production %>% tidyr::pivot_wider( names_from = c(product, country), values_from = production, names_glue = "prod_{product}_{country}" ) y <- production %>% data_to_wide( names_from = c("product", "country"), values_from = "production", names_glue = "prod_{product}_{country}" ) expect_identical(x, y) }) test_that("data_to_wide, names_glue works", { skip_if_not_installed("tidyr") df <- data.frame( food = c( "banana", "banana", "banana", "banana", "cheese", "cheese", "cheese", "cheese" ), binary = rep(c("yes", "no"), 4), car = c( "toyota", "subaru", "mazda", "skoda", "toyota", "subaru", "mazda", "skoda" ), fun = c(2, 4, 3, 6, 2, 4, 2, 3), stringsAsFactors = FALSE ) x <- df %>% tidyr::pivot_wider( id_cols = food, names_from = c(car, binary), names_glue = "{binary}_{car}", values_from = fun ) y <- df %>% data_to_wide( id_cols = "food", names_from = c("car", "binary"), names_glue = "{binary}_{car}", values_from = "fun" ) expect_identical(x, y, ignore_attr = TRUE) }) test_that("preserve date format", { skip_if_not_installed("tidyr") family <- tidyr::tibble( family = c(1L, 1L, 2L, 2L, 3L, 3L), child = c( "dob_child1", "dob_child2", "dob_child1", "dob_child2", "dob_child1", "dob_child2" ), value = as.Date(c( "1998-11-26", "2000-01-29", "2004-10-10", NA, "2000-12-05", "2004-04-05" )) ) tidyr <- tidyr::pivot_wider( family, names_from = "child", values_from = "value" ) datawiz <- data_to_wide(family, names_from = "child", values_from = "value") expect_identical(tidyr, datawiz) }) test_that("#293", { skip_if_not_installed("tidyr") weekdays <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") daily <- tidyr::tibble( day = factor(c("Tue", "Thu", "Fri", "Mon"), levels = weekdays), value = c(2, 3, 1, 5) ) expect_identical( tidyr::pivot_wider(daily, names_from = day, values_from = value), data_to_wide(daily, names_from = "day", values_from = "value") ) }) test_that("new names starting with digits are not corrected automatically", { skip_if_not_installed("tidyr") percentages <- tidyr::tibble( year = c(2018, 2019, 2020, 2020), type = factor(c("A", "B", "A", "B"), levels = c("A", "B")), percentage = c(100, 100, 40, 60) ) tidyr <- tidyr::pivot_wider( percentages, names_from = c(year, type), values_from = percentage ) datawiz <- data_to_wide( percentages, names_from = c("year", "type"), values_from = "percentage" ) expect_identical(tidyr, datawiz) }) test_that("Preserve column name when names_from column only has one unique value", { d <- data.frame( Value = rnorm(10), Level = paste0("Participant_", 1:10), Parameter = "Intercept", stringsAsFactors = FALSE ) out <- data_to_wide( d, values_from = "Value", names_from = "Parameter", names_sep = "_" ) expect_named(out, c("Level", "Intercept")) expect_identical(nrow(out), 10L) d <- data.frame( Value = rnorm(10), Level = paste0("Participant_", 1:10), Parameter = c("Intercept", "abc"), stringsAsFactors = FALSE ) out <- data_to_wide( d, values_from = "Value", names_from = "Parameter", names_sep = "_" ) expect_named(out, c("Level", "Intercept", "abc")) expect_identical(nrow(out), 10L) }) test_that("data_to_wide with multiple values_from and unbalanced panel", { skip_if_not_installed("tidyr") long_df <- tidyr::tibble( subject_id = c(1, 1, 2, 2, 3, 5, 4, 4), time = rep(c(1, 2), 4), score = c(10, NA, 15, 12, 18, 11, NA, 14), anxiety = c(5, 7, 6, NA, 8, 4, 5, NA) ) tidyr <- tidyr::pivot_wider( long_df, id_cols = "subject_id", names_from = time, values_from = c(score, anxiety) ) datawiz <- data_to_wide( long_df, id_cols = "subject_id", names_from = "time", values_from = c("score", "anxiety") ) expect_identical(tidyr, datawiz) }) test_that("data_to_wide preserves empty columns", { long_df <- data.frame( subject_id = c(1, 1, 2, 2, 3, 5, 4, 4), time = rep(c(1, 2), 4), score = c(10, NA, 15, 12, 18, 11, NA, 14), anxiety = c(5, 7, 6, NA, 8, 4, 5, NA), test = rep(NA_real_, 8) ) out <- data_to_wide( long_df, id_cols = "subject_id", names_from = "time", values_from = c("score", "anxiety", "test") ) expect_equal( out, data.frame( subject_id = c(1, 2, 3, 5, 4), score_1 = c(10, 15, 18, NA, NA), score_2 = c(NA, 12, NA, 11, 14), anxiety_1 = c(5, 6, 8, NA, 5), anxiety_2 = c(7, NA, NA, 4, NA), test_1 = as.double(c(NA, NA, NA, NA, NA)), test_2 = as.double(c(NA, NA, NA, NA, NA)) ), ignore_attr = TRUE ) }) test_that("data_to_wide, check for valid columns", { long_df <- data.frame( subject_id = c(1, 1, 2, 2, 3, 5, 4, 4), time = rep(c(1, 2), 4), score = c(10, NA, 15, 12, 18, 11, NA, 14), anxiety = c(5, 7, 6, NA, 8, 4, 5, NA), test = rep(NA_real_, 8) ) expect_error( data_to_wide( long_df, id_cols = "id", names_from = "time", values_from = c("score", "anxiety", "test") ), regexp = "`id_cols` must be the names of", fixed = TRUE ) expect_error( data_to_wide( long_df, id_cols = "subject_id", names_from = "times", values_from = c("score", "anxiety", "test") ), regexp = "`names_from` must be the name of", fixed = TRUE ) expect_warning( data_to_wide( long_df, id_cols = "subject_id", names_from = "time", values_from = c("scores", "anxiety", "test") ), regexp = "Following variable(s) were not found", fixed = TRUE ) expect_error( expect_warning(expect_warning(expect_warning( data_to_wide( long_df, id_cols = "subject_id", names_from = "time", values_from = c("a", "b", "c") ) ))), regexp = "No variable defined", fixed = TRUE ) }) test_that("data_to_wide, select helper for values_from", { long_df <- data.frame( subject_id = c(1, 1, 2, 2, 3, 5, 4, 4), time = rep(c(1, 2), 4), score_a = c(10, NA, 15, 12, 18, 11, NA, 14), score_b = c(5, 7, 6, NA, 8, 4, 5, NA), score_c = rep(NA_real_, 8) ) out <- data_to_wide( long_df, id_cols = "subject_id", names_from = "time", values_from = starts_with("score_") ) expect_equal( out, data.frame( subject_id = c(1, 2, 3, 5, 4), score_a_1 = c(10, 15, 18, NA, NA), score_a_2 = c(NA, 12, NA, 11, 14), score_a_1 = c(5, 6, 8, NA, 5), score_a_2 = c(7, NA, NA, 4, NA), score_a_1 = as.double(c(NA, NA, NA, NA, NA)), score_a_2 = as.double(c(NA, NA, NA, NA, NA)) ), ignore_attr = TRUE ) }) ================================================ FILE: tests/testthat/test-data_unique.R ================================================ # Preparations df1 <- data.frame( id = c(1, 2, 3, 1, 3), year = c(2022, 2022, 2022, 2022, 2000), item1 = c(NA, 1, 1, 2, 3), item2 = c(NA, 1, 1, 2, 3), item3 = c(NA, 1, 1, 2, 3) ) expected1 <- data.frame( id = c(1, 2, 3), year = c(2022, 2022, 2022), item1 = c(2, 1, 1), item2 = c(2, 1, 1), item3 = c(2, 1, 1) ) expected2 <- data.frame( id = c(1, 2, 3), year = c(2022, 2022, 2022), item1 = c(NA, 1, 1), item2 = c(NA, 1, 1), item3 = c(NA, 1, 1) ) expected3 <- data.frame( id = c(1, 2, 3), year = c(2022, 2022, 2000), item1 = c(2, 1, 3), item2 = c(2, 1, 3), item3 = c(2, 1, 3) ) expected4 <- data.frame( id = c(1, 2, 3, 3), year = c(2022, 2022, 2022, 2000), item1 = c(2, 1, 1, 3), item2 = c(2, 1, 1, 3), item3 = c(2, 1, 1, 3) ) # Testing test_that("data_unique returns original data if no duplicates", { test <- data.frame(x = c(1, 2), y = c(3, 4)) expect_identical( data_unique(test, c("x", "y"), verbose = FALSE), test ) expect_identical( data_unique(test, "x", verbose = FALSE), test ) }) test_that("data_unique basic", { expect_identical( data_unique(df1, select = "id", verbose = FALSE), expected1 ) }) test_that("data_unique basic method best", { expect_identical( data_unique(df1, select = "id", keep = "best", verbose = FALSE), expected1 ) }) test_that("data_unique basic method first", { expect_identical( data_unique(df1, select = "id", keep = "first", verbose = FALSE), expected2 ) }) test_that("data_unique basic method last", { expect_identical( data_unique(df1, select = "id", keep = "last", verbose = FALSE), expected3 ) }) test_that("data_unique unquoted", { expect_identical( data_unique(df1, select = id, verbose = FALSE), expected1 ) }) test_that("data_unique vector", { expect_identical( data_unique(df1, select = 1, verbose = FALSE), expected1 ) }) test_that("data_unique select-helper", { expect_identical( data_unique(df1, select = contains("id"), verbose = FALSE), expected1 ) }) test_that("data_unique multiple IDs", { x <- data_unique(df1, select = c("id", "year"), verbose = FALSE) rownames(x) <- NULL expect_identical( x, expected4 ) }) test_that("data_unique multiple IDs formula", { x <- data_unique(df1, select = ~ id + year, verbose = FALSE) rownames(x) <- NULL expect_identical( x, expected4 ) }) test_that("data_unique multiple IDs vector", { x <- data_unique(df1, select = 1:2, verbose = FALSE) rownames(x) <- NULL expect_identical( x, expected4 ) }) test_that("data_unique preserve attributes", { attr(df1, "testing") <- "custom.attribute" x <- attributes(data_unique(df1, id, verbose = FALSE)) expect_identical( x$testing, "custom.attribute" ) }) test_that("data_unique, arg 'verbose' works", { expect_message( data_unique(df1, select = ~ id + year), "removed, with method" ) }) test_that("data_unique works with groups", { df <- data.frame( g = c(1, 1, 2, 2), x = c(1, 1, 2, 1) ) df <- data_group(df, "g") expected <- data.frame( g = c(1, 2, 2), x = c(1, 2, 1) ) expected <- data_group(expected, "g") x <- data_unique(df, "x", verbose = FALSE) expect_identical(x, expected, ignore_attr = TRUE) y <- attributes(x) expect_identical(attributes(df)$class, y$class) expect_identical(attributes(df)$groups, y$groups) }) ================================================ FILE: tests/testthat/test-data_unite.R ================================================ d_unite <- data.frame( x = c(NA, 1:3), y = c(letters[1:3], NA_character_), z = 6:9, m = c("X", NA_character_, "Y", "Z"), n = c("NATION", "COUNTRY", "NATION", NA_character_), stringsAsFactors = FALSE ) # for following tests, we need to check for correct column names, # and correct values in new variable test_that("data_unite: simple use case", { # basic out <- data_unite(d_unite, new_column = "xyz") expect_identical(colnames(out), "xyz") expect_identical( out$xyz, c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA") ) # use existing column name out <- data_unite(d_unite, new_column = "x") expect_identical(colnames(out), "x") expect_identical( out$x, c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA") ) # select out <- data_unite(d_unite, new_column = "xyz", select = c("x", "n")) expect_identical( colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "xyz") ) expect_identical( out$xyz, c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA") ) # select, use existing column name out <- data_unite(d_unite, new_column = "x", select = c("x", "n")) expect_identical( colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "x") ) expect_identical( out$x, c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA") ) }) test_that("data_unite: remove_na", { # basic out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE) expect_identical(colnames(out), "xyz") expect_identical( out$xyz, c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z") ) # use existing column name out <- data_unite(d_unite, new_column = "x", remove_na = TRUE) expect_identical(colnames(out), "x") expect_identical( out$x, c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z") ) # select out <- data_unite( d_unite, new_column = "xyz", remove_na = TRUE, select = c("x", "n") ) expect_identical( colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "xyz") ) expect_identical( out$xyz, c("NATION", "1_COUNTRY", "2_NATION", "3") ) # select, use existing column name out <- data_unite( d_unite, new_column = "x", remove_na = TRUE, select = c("x", "n") ) expect_identical( colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "x") ) expect_identical( out$x, c("NATION", "1_COUNTRY", "2_NATION", "3") ) }) test_that("data_unite: append", { # basic out <- data_unite(d_unite, new_column = "xyz", append = TRUE) expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz")) expect_identical( out$xyz, c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA") ) # remove NA out <- data_unite( d_unite, new_column = "xyz", remove_na = TRUE, append = TRUE ) expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz")) expect_identical( out$xyz, c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z") ) # append, using existing column name expect_message({ out <- data_unite(d_unite, new_column = "x", append = TRUE) }) expect_identical(colnames(out), c("x", "y", "z", "m", "n")) expect_identical( out$x, c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA") ) # append, using existing column name, and remove NA expect_message({ out <- data_unite( d_unite, new_column = "x", remove_na = TRUE, append = TRUE ) }) expect_identical(colnames(out), c("x", "y", "z", "m", "n")) expect_identical( out$x, c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z") ) }) test_that("data_unite: combine select and append", { # basic out <- data_unite( d_unite, new_column = "xyz", append = TRUE, select = c("x", "n") ) expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz")) expect_identical( out$xyz, c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA") ) # remove NA out <- data_unite( d_unite, new_column = "xyz", remove_na = TRUE, append = TRUE, select = c("x", "n") ) expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz")) expect_identical( out$xyz, c("NATION", "1_COUNTRY", "2_NATION", "3") ) # append, using existing column name expect_message({ out <- data_unite( d_unite, new_column = "x", append = TRUE, select = c("x", "n") ) }) expect_identical(colnames(out), c("x", "y", "z", "m", "n")) expect_identical( out$x, c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA") ) # append, using existing column name, and remove NA expect_message({ out <- data_unite( d_unite, new_column = "x", remove_na = TRUE, append = TRUE, select = c("x", "n") ) }) expect_identical(colnames(out), c("x", "y", "z", "m", "n")) expect_identical( out$x, c("NATION", "1_COUNTRY", "2_NATION", "3") ) }) test_that("data_unite: errors", { expect_error(data_unite(d_unite), regex = "No name") expect_error( data_unite(d_unite, new_column = c("a", "b")), regex = "a single string" ) expect_error( expect_warning(data_unite(d_unite, new_column = "a", select = "huhu")), regex = "At least" ) }) ================================================ FILE: tests/testthat/test-data_write.R ================================================ skip_if_not_installed("httr") skip_if_not_installed("haven") skip_if_not_installed("readr") skip_on_cran() skip_if_not_installed("curl") skip_if_offline() # prepare data set --------------- data(efc) d <- data_filter(efc, 1:5) d$e42dep <- droplevels(d$e42dep) # data encryption with rds ------------------ test_that("data_write, encrypting rds files", { skip_if_not_installed("withr") skip_if_not_installed("openssl") withr::with_tempfile("tmp", fileext = ".rds", code = { expect_warning(data_write(d, tmp, password = "test"), "Remember") # no password, returns encrypted data frame d2 <- data_read(tmp, verbose = FALSE) expect_named(d2, "out") expect_false(identical(d, d2)) # password, returns decrypted data frame d2 <- data_read(tmp, password = "test") expect_identical(d, d2) # wrong password expect_error(data_read(tmp, password = "text"), "File does not appear") # invalid password arguments expect_error( data_read(tmp, password = c("test", "test2")), regex = "The password must be a single" ) expect_error( data_read(tmp, password = 123), regex = "The password must be a single" ) expect_error( data_read(tmp, password = ""), regex = "The password must be a single" ) expect_error( data_write(d, tmp, password = c("test", "test2")), regex = "The password must be a single" ) expect_error( data_write(d, tmp, password = 123), regex = "The password must be a single" ) expect_error( data_write(d, tmp, password = ""), regex = "The password must be a single" ) # not encrypted data_write(d, tmp) expect_error(data_read(tmp, password = "test"), "File does not appear") # check other decryption functions, should fail when encrypted with datawizard expect_warning(data_write(d, tmp, password = "test")) out <- readRDS(tmp) key <- openssl::sha256(charToRaw("test")) expect_error(openssl::aes_cbc_decrypt(out, key = key)) # check other encryption functions, should fail imported with datawizard x <- serialize(d, NULL) key <- openssl::sha256(charToRaw("test")) saveRDS(openssl::aes_cbc_encrypt(x, key = key), tmp) expect_error(data_read(tmp, password = "test"), "File does not appear") }) }) # data encryption with rdata ------------------ test_that("data_write, encrypting rdata files", { skip_if_not_installed("withr") skip_if_not_installed("openssl") withr::with_tempfile("tmp", fileext = ".rdata", code = { expect_warning(data_write(d, tmp, password = "test"), "Remember") # no password, returns encrypted data frame d2 <- data_read(tmp, verbose = FALSE) expect_named(d2, "out") # password, returns decrypted data frame d2 <- data_read(tmp, password = "test") expect_identical(d, d2) }) }) # data encryption with parquet ------------------ test_that("data_write, encrypting parquet files", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".parquet", code = { expect_error( data_write(d, tmp, password = "test"), "Data encryption is not supported" ) }) }) # SPSS ------------------------------------- test_that("data_write, SPSS", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".sav", code = { expect_message(data_write(d, tmp)) d2 <- data_read(tmp, verbose = FALSE) expect_equal( to_factor(d, select = c("e16sex", "c172code")), d2, ignore_attr = TRUE ) # data encryption not available for SPSS etc. expect_error( data_write(d, tmp, password = "test"), "Data encryption is not supported" ) }) }) test_that("data_write, SPSS, mixed types of labelled vectors", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".sav", code = { d <- data.frame( a = 1:3, b = letters[1:3], c = factor(letters[1:3]), d = as.Date(c("2022-01-01", "2022-02-01", "2022-03-01")), e = c(TRUE, FALSE, FALSE), stringsAsFactors = FALSE ) # Date and Logical cannot be labelled d$a <- assign_labels( d$a, variable = "First", values = c("one", "two", "three") ) d$b <- assign_labels(d$b, variable = "Second", values = c("A", "B", "C")) d$c <- assign_labels( d$c, variable = "Third", values = c("ey", "bee", "see") ) expect_message(data_write(d, tmp), regex = "Preparing") }) }) # Stata ------------------------------------- test_that("data_write, Stata", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".dta", code = { data_write(d, tmp, verbose = FALSE) d2 <- data_read(tmp, verbose = FALSE) expect_equal( to_factor(d, select = c("e16sex", "c172code")), d2, ignore_attr = TRUE ) # data encryption not available for SPSS etc. expect_error( data_write(d, tmp, password = "test"), "Data encryption is not supported" ) }) }) # csv ------------------------- test_that("data_write, CSV, keep numeric", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".csv", code = { data_write(d, tmp) d2 <- data_read(tmp) expect_equal( to_numeric(d, dummy_factors = FALSE, preserve_levels = TRUE), d2, ignore_attr = TRUE ) # data encryption not available for SPSS etc. expect_error( data_write(d, tmp, password = "test"), "Data encryption is not supported" ) }) }) test_that("data_write, CSV, convert to factor", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".csv", code = { data_write(d, tmp, convert_factors = TRUE) d2 <- data_read(tmp) out <- to_factor(d, select = c("e16sex", "c172code")) out$e16sex <- as.character(out$e16sex) out$c172code <- as.character(out$c172code) out$e42dep <- as.numeric(as.character(out$e42dep)) expect_equal(out, d2, ignore_attr = TRUE) }) }) test_that("data_write, CSV, create labels file", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".csv", code = { # file for labels fpath <- dirname(tmp) fname <- sub("\\.csv$", "", basename(tmp)) tmp2 <- paste0(fpath, .Platform$file.sep, fname, "_labels.csv") on.exit(unlink(tmp2)) data(efc) expect_message(data_write(efc, tmp, save_labels = TRUE)) d <- data_read(tmp2) expect_identical(d$variable[2], "e16sex") expect_identical(d$label[2], "elder's gender") expect_identical(d$labels[2], "1=male; 2=female") expect_message(data_write(efc, tmp, save_labels = TRUE, delimiter = ";")) d <- data_read(tmp2) expect_identical(d$variable[2], "e16sex") expect_identical(d$label[2], "elder's gender") expect_identical(d$labels[2], "1=male; 2=female") }) }) # invalid file type ------------------------- test_that("data_write, no file extension", { expect_error(data_write(d, "mytestfile")) expect_error(data_write(d, NULL)) }) # writing character vector works for missing value labels ------------------ test_that("data_write, existing variable label but missing value labels", { skip_if_not_installed("withr") withr::with_tempfile("tmp", fileext = ".sav", code = { d <- data.frame( a = letters[1:3], stringsAsFactors = FALSE ) d$a <- assign_labels(d$a, variable = "First") # expect message, but no error expect_message(data_write(d, tmp), regex = "Preparing") # check if data is really the same d2 <- data_read(tmp, verbose = FALSE) expect_identical(d2, d) }) }) ================================================ FILE: tests/testthat/test-demean.R ================================================ test_that("demean works", { df <- iris set.seed(123) df$ID <- sample.int(4, nrow(df), replace = TRUE) # fake-ID set.seed(123) df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable set.seed(123) x <- demean( df, select = c("Sepal.Length", "Petal.Length"), by = "ID", append = FALSE ) expect_snapshot(head(x)) set.seed(123) expect_message( { x <- demean( df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE ) }, "have been coerced to numeric" ) expect_snapshot(head(x)) set.seed(123) expect_message( { y <- demean( df, select = ~ Sepal.Length + binary + Species, by = ~ID, append = FALSE ) }, "have been coerced to numeric" ) expect_message( { z <- demean( df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE ) }, "have been coerced to numeric" ) expect_identical(y, z) set.seed(123) x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID") expect_named( x, c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "ID", "binary", "Sepal.Length_between", "Petal.Length_between", "Sepal.Length_within", "Petal.Length_within" ) ) expect_snapshot(head(x)) df$Sepal.Length_within <- df$Sepal.Length expect_error( demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID"), regex = "One or more of" ) }) test_that("demean interaction term", { dat <- data.frame( a = c(1, 2, 3, 4, 1, 2, 3, 4), x = c(4, 3, 3, 4, 1, 2, 1, 2), y = c(1, 2, 1, 2, 4, 3, 2, 1), ID = c(1, 2, 3, 1, 2, 3, 1, 2) ) set.seed(123) expect_snapshot(demean( dat, select = c("a", "x*y"), by = "ID", append = FALSE )) }) test_that("demean shows message if some vars don't exist", { dat <- data.frame( a = c(1, 2, 3, 4, 1, 2, 3, 4), x = c(4, 3, 3, 4, 1, 2, 1, 2), y = c(1, 2, 1, 2, 4, 3, 2, 1), ID = c(1, 2, 3, 1, 2, 3, 1, 2) ) set.seed(123) expect_error( demean(dat, select = "foo", by = "ID"), regexp = "not found" ) }) # see issue #520 test_that("demean for cross-classified designs (by > 1)", { skip_if_not_installed("poorman") data(efc, package = "datawizard") dat <- na.omit(efc) dat$e42dep <- factor(dat$e42dep) dat$c172code <- factor(dat$c172code) x2a <- dat %>% data_group(e42dep) %>% data_modify( c12hour_e42dep = mean(c12hour) ) %>% data_ungroup() %>% data_group(c172code) %>% data_modify( c12hour_c172code = mean(c12hour) ) %>% data_ungroup() %>% data_modify( c12hour_within = c12hour - c12hour_e42dep - c12hour_c172code ) out <- degroup( dat, select = "c12hour", by = c("e42dep", "c172code"), suffix_demean = "_within" ) expect_equal( out$c12hour_e42dep_between, x2a$c12hour_e42dep, tolerance = 1e-4, ignore_attr = TRUE ) expect_equal( out$c12hour_within, x2a$c12hour_within, tolerance = 1e-4, ignore_attr = TRUE ) x2a <- dat %>% data_group(e42dep) %>% data_modify( c12hour_e42dep = mean(c12hour, na.rm = TRUE), neg_c_7_e42dep = mean(neg_c_7, na.rm = TRUE) ) %>% data_ungroup() %>% data_group(c172code) %>% data_modify( c12hour_c172code = mean(c12hour, na.rm = TRUE), neg_c_7_c172code = mean(neg_c_7, na.rm = TRUE) ) %>% data_ungroup() %>% data_modify( c12hour_within = c12hour - c12hour_e42dep - c12hour_c172code, neg_c_7_within = neg_c_7 - neg_c_7_e42dep - neg_c_7_c172code ) out <- degroup( dat, select = c("c12hour", "neg_c_7"), by = c("e42dep", "c172code"), suffix_demean = "_within" ) expect_equal( out$c12hour_e42dep_between, x2a$c12hour_e42dep, tolerance = 1e-4, ignore_attr = TRUE ) expect_equal( out$neg_c_7_c172code_between, x2a$neg_c_7_c172code, tolerance = 1e-4, ignore_attr = TRUE ) expect_equal( out$neg_c_7_within, x2a$neg_c_7_within, tolerance = 1e-4, ignore_attr = TRUE ) expect_equal( out$c12hour_within, x2a$c12hour_within, tolerance = 1e-4, ignore_attr = TRUE ) # More than 2 groupings mu <- 100 ul <- setNames(c(-1, -3, 0, 4), nm = letters[1:4]) uL <- setNames(c(10, 30, 0, -40), nm = LETTERS[1:4]) um <- setNames(c(100, 150, -250), nm = month.abb[1:3]) dat <- expand.grid(l = letters[1:4], L = LETTERS[1:4], m = month.abb[1:3]) set.seed(111) e <- rnorm(nrow(dat) - 1) |> round(2) e <- append(e, -sum(e)) dat$y <- mu + ul[dat$l] + uL[dat$L] + um[dat$m] + e dat$z <- mu + ul[dat$l] + uL[dat$L] + um[dat$m] + 10 * e dat_dem <- datawizard::demean( dat, by = c("l", "L", "m"), select = c("y", "z") ) expect_equal(dat_dem$y_l_between, ave(dat$y, dat$l), ignore_attr = TRUE) expect_equal(dat_dem$y_L_between, ave(dat$y, dat$L), ignore_attr = TRUE) expect_equal(dat_dem$y_m_between, ave(dat$y, dat$m), ignore_attr = TRUE) expect_equal(rowSums(dat_dem[grepl("^y_", colnames(dat_dem))]), dat$y) expect_equal(rowSums(dat_dem[grepl("^z_", colnames(dat_dem))]), dat$z) }) test_that("demean, sanity checks", { data(efc, package = "datawizard") dat <- na.omit(efc) dat$e42dep <- factor(dat$e42dep) dat$c172code <- factor(dat$c172code) expect_error( degroup( dat, select = c("c12hour", "neg_c_8"), by = c("e42dep", "c172code"), suffix_demean = "_within" ), regex = "Variable \"neg_c_8\" was not found" ) expect_error( degroup( dat, select = c("c12hour", "neg_c_8"), by = c("e42dep", "c173code"), suffix_demean = "_within" ), regex = "Variables \"neg_c_8\" and \"c173code\" were not found" ) }) test_that("demean for nested designs (by > 1), nested = TRUE", { data(efc, package = "datawizard") dat <- na.omit(efc) dat$e42dep <- factor(dat$e42dep) dat$c172code <- factor(dat$c172code) x_ijk <- dat$c12hour xbar_k <- ave(x_ijk, dat$e42dep, FUN = mean) xbar_jk <- ave(x_ijk, dat$e42dep, dat$c172code, FUN = mean) L3_between <- xbar_k L2_between <- xbar_jk - xbar_k L1_within <- x_ijk - xbar_jk out <- degroup( dat, select = "c12hour", by = c("e42dep", "c172code"), nested = TRUE, suffix_demean = "_within" ) expect_equal( out$c12hour_within, L1_within, tolerance = 1e-4, ignore_attr = TRUE ) expect_equal( out$c12hour_e42dep_between, L3_between, tolerance = 1e-4, ignore_attr = TRUE ) expect_equal( out$c12hour_c172code_between, L2_between, tolerance = 1e-4, ignore_attr = TRUE ) # Following #635 testdf <- data.frame( roman = c("I", "I", "I", "I", "II", "II", "II", "II"), alphabet = c("a", "a", "b", "b", "a", "b", "a", "b"), val1 = c(1, 2, 3, 4, 5, 6, 7, 8), val2 = c(1, 2, 3, 4, 5, 6, 7, 8), val3 = c(1, 2, 3, 4, 5, 6, 7, 8) ) out <- datawizard::demean( testdf, select = c("val1", "val2", "val3"), by = "roman/alphabet", append = FALSE ) expect_named( out, c( "val1_roman_between", "val1_alphabet_between", "val2_roman_between", "val2_alphabet_between", "val3_roman_between", "val3_alphabet_between", "val1_within", "val2_within", "val3_within" ) ) expect_equal( as.vector(out$val1_within), c(-0.5, 0.5, -0.5, 0.5, -1, -1, 1, 1) ) expect_equal(out$val1_within, out$val2_within) expect_equal(out$val1_within, out$val3_within) expect_equal( as.vector(out$val1_roman_between), c(2.5, 2.5, 2.5, 2.5, 6.5, 6.5, 6.5, 6.5) ) expect_equal(out$val1_roman_between, out$val2_roman_between) expect_equal(out$val1_roman_between, out$val3_roman_between) expect_equal( as.vector(out$val1_alphabet_between), c(-1, -1, 1, 1, -0.5, 0.5, -0.5, 0.5) ) expect_equal(out$val1_alphabet_between, out$val2_alphabet_between) expect_equal(out$val1_alphabet_between, out$val3_alphabet_between) expect_equal(rowSums(out[, grepl("^val1", names(out))]), testdf$val1) }) ================================================ FILE: tests/testthat/test-describe_distribution.R ================================================ skip_if_not_installed("bayestestR") # numeric --------------------------------------- test_that("describe_distribution - numeric: works with basic numeric vector", { x <- describe_distribution(mtcars$mpg) expect_identical(dim(x), c(1L, 9L)) expect_identical(round(x$Mean), 20) }) test_that("describe_distribution - numeric: correctly handles missing values", { no_missing <- describe_distribution(mtcars$mpg) test <- mtcars$mpg test[1] <- NA with_missing <- describe_distribution(test) expect_identical(with_missing$n, 31L) expect_identical(with_missing$n_Missing, 1L) expect_false(with_missing$Mean == no_missing$Mean) }) test_that("describe_distribution - numeric: works with quartiles", { x <- describe_distribution(mtcars$mpg, quartiles = TRUE) expect_identical(dim(x), c(1L, 11L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) test_that("describe_distribution - numeric: works with range", { x <- describe_distribution(mtcars$mpg, range = FALSE) expect_identical(dim(x), c(1L, 7L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) test_that("describe_distribution - NULL for date", { v <- as.Date(c("2022-01-01", "2022-01-02")) expect_warning(expect_null(describe_distribution(v))) }) # data frame --------------------------------------- test_that("describe_distribution - data frame: works with basic data frame", { x <- describe_distribution(mtcars) expect_identical(dim(x), c(11L, 10L)) expect_identical(round(x[1, "Mean"]), 20) }) test_that("describe_distribution - data frame: correctly handles missing values", { no_missing <- describe_distribution(mtcars) test <- mtcars test[1, ] <- NA with_missing <- describe_distribution(test) expect_identical(unique(with_missing$n), 31L) expect_identical(unique(with_missing$n_Missing), 1L) expect_false(unique(with_missing$Mean == no_missing$Mean)) }) test_that("describe_distribution - data frame: works with quartiles", { x <- describe_distribution(mtcars, quartiles = TRUE) expect_identical(dim(x), c(11L, 12L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) test_that("describe_distribution - data frame: works with range", { x <- describe_distribution(mtcars, range = FALSE) expect_identical(dim(x), c(11L, 8L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) # factor --------------------------------------- test_that("describe_distribution - factor", { expect_snapshot(describe_distribution(factor(substring( "statistics", 1:10, 1:10 )))) }) # character --------------------------------------- test_that("describe_distribution - character", { expect_snapshot(describe_distribution(as.character(ToothGrowth$supp))) }) # list --------------------------------------- test_that("describe_distribution - list: works with basic list", { x <- list(mtcars$mpg, mtcars$cyl) stored <- describe_distribution(x) unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl)) named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) expect_identical(dim(stored), c(2L, 10L)) expect_identical(round(stored$Mean), c(20, 6)) expect_identical(dim(unnamed), c(2L, 10L)) expect_identical(round(unnamed$Mean), c(20, 6)) expect_identical(dim(named), c(2L, 10L)) expect_identical(round(named$Mean), c(20, 6)) expect_identical(dim(mix), c(2L, 10L)) expect_identical(round(mix$Mean), c(20, 6)) }) test_that("describe_distribution - list: works with include_factors", { x1 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl))) y <- describe_distribution(list(mtcars$mpg)) expect_identical(x1, y) x2 <- describe_distribution( list(mtcars$mpg, factor(mtcars$cyl)), include_factors = TRUE ) expect_identical(dim(x2), c(2L, 10L)) expect_identical(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)")) x3 <- describe_distribution( list(mtcars$mpg, foo = factor(mtcars$cyl)), include_factors = TRUE ) expect_identical(dim(x3), c(2L, 10L)) expect_identical(x3$Variable, c("mtcars$mpg", "foo")) }) test_that("describe_distribution - list: correctly removes character elements", { x <- describe_distribution(list(mtcars$mpg, "something")) y <- describe_distribution(list(mtcars$mpg)) expect_identical(x, y) }) test_that("describe_distribution - list: correctly handles variable names", { x <- list(mtcars$mpg, mtcars$cyl) stored <- describe_distribution(x) unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl)) named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) expect_identical(stored$Variable, c("Var_1", "Var_2")) expect_identical(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl")) expect_identical(named$Variable, c("foo", "foo2")) expect_identical(mix$Variable, c("foo", "mtcars$cyl")) }) test_that("describe_distribution - list: correctly handles missing values", { no_missing <- describe_distribution(list(mtcars$mpg, mtcars$cyl)) test <- mtcars$mpg test2 <- mtcars$cyl test[1] <- NA test2[1] <- NA with_missing <- describe_distribution(list(test, test2)) expect_identical(unique(with_missing$n), 31L) expect_identical(unique(with_missing$n_Missing), 1L) expect_false(unique(with_missing$Mean == no_missing$Mean)) }) test_that("describe_distribution - list: works with quartiles", { x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), quartiles = TRUE) expect_identical(dim(x), c(2L, 12L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) test_that("describe_distribution - list: works with range", { x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), range = FALSE) expect_identical(dim(x), c(2L, 8L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) # select ---------------------- test_that("describe_distribution - select", { data(iris) out <- describe_distribution(iris, select = starts_with("Petal")) expect_identical(out$Variable, c("Petal.Length", "Petal.Width")) expect_equal(out$Mean, c(3.758000, 1.199333), tolerance = 1e-3) expect_null(describe_distribution(iris, select = "Species")) out <- describe_distribution(iris, select = "Species", include_factors = TRUE) exp <- describe_distribution(iris$Species) expect_identical(out$Range, exp$Range) }) # select and grouped df ---------------------- test_that("describe_distribution - grouped df", { data(iris) x <- data_group(iris, Species) out <- describe_distribution(x, select = starts_with("Petal")) expect_snapshot(out) expect_equal( out$Mean, c(1.462, 0.246, 4.26, 1.326, 5.552, 2.026), tolerance = 1e-3 ) }) # Mostly to test printing test_that("describe_distribution - grouped df and multiple groups", { x <- data.frame( grp1 = rep(letters[1:3], each = 20), grp2 = rep(letters[1:3], 20), values = 1:30 ) x <- data_group(x, c("grp1", "grp2")) expect_snapshot(describe_distribution(x)) }) test_that("argument 'by' works", { # basic grouped <- data_group(mtcars, c("am", "vs")) expect_identical( describe_distribution(grouped), describe_distribution(mtcars, by = c("am", "vs")), ignore_attr = TRUE ) # mixing data_group() and arg 'by' grouped <- data_group(mtcars, c("am", "vs")) half_grouped <- data_group(mtcars, "am") expect_identical( describe_distribution(grouped), describe_distribution(half_grouped, by = "vs"), ignore_attr = TRUE ) expect_error( describe_distribution(mtcars, by = 2), "must be a character vector" ) }) test_that("empty groups are discarded from the output, #608", { dat <- data.frame( grp1 = factor("a", levels = c("a", "b")), grp2 = factor(c("A", "B")), value = 1:2 ) dat <- data_group(dat, c("grp1", "grp2")) expect_no_error( suppressWarnings(describe_distribution(dat, ci = 0.95)) ) }) # distribution_mode -------------------------- test_that("distribution_mode works as expected", { # atomic vector expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 5)), 3) expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 4, 5)), 3) expect_identical(distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)), 3.7) # list expect_identical(distribution_mode(list(1, 2, 3, 3, 4, 5)), list(3)) # scalar expect_identical(distribution_mode("a"), "a") # empty expect_null(distribution_mode(NULL)) }) # select helpers ------------------------------ test_that("describe_distribution regex", { expect_equal( describe_distribution(mtcars, select = "pg", regex = TRUE), describe_distribution(mtcars, select = "mpg"), ignore_attr = TRUE ) }) # formatting ------------------------------ test_that("describe_distribution formatting", { data(iris) x <- describe_distribution(iris$Sepal.Width, quartiles = TRUE) expect_snapshot(format(x)) }) # other ----------------------------------- test_that("return NA in CI if sample is too sparse", { set.seed(123456) expect_silent(expect_message( { res <- describe_distribution( mtcars[mtcars$cyl == "6", ], wt, centrality = "map", ci = 0.95 ) }, regex = "Bootstrapping" )) expect_equal(res$CI_low_map, 2.6462, tolerance = 1e-2) expect_equal(res$CI_high_map, 3.4531, tolerance = 1e-2) x <- c( 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.2, 2.2, 2.2, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5 ) expect_message( { out <- describe_distribution(x, centrality = "map") }, regex = "Could not calculate" ) expect_identical(out$MAP, NA_real_) expect_silent(describe_distribution(x, centrality = "map", verbose = FALSE)) }) # check for reserved column names test_that("errors on invalid column names (reserved word)", { data(mtcars) out <- data_to_long( mtcars, cols = 1:3, names_to = "Variable", values_to = "Values" ) out <- data_group(out, c("gear", "Variable")) expect_error( describe_distribution(out, select = "Values"), regex = "Following variable names are reserved" ) out <- data_to_long( mtcars, cols = 1:3, names_to = "Variable", values_to = "Values" ) expect_error( describe_distribution(out, select = "Variable"), regex = "Following variable names are reserved" ) }) # multiple centralities test_that("multiple centralities work", { data(iris) out <- describe_distribution( iris, select = "Petal.Width", centrality = c("median", "mean") ) expect_identical(dim(out), c(1L, 12L)) expect_named( out, c( "Variable", "Median", "MAD", "Mean", "SD", "IQR", "Min", "Max", "Skewness", "Kurtosis", "n", "n_Missing" ) ) out <- describe_distribution( iris, select = "Petal.Width", centrality = list("median", "mean") ) expect_identical(dim(out), c(1L, 12L)) expect_named( out, c( "Variable", "Median", "MAD", "Mean", "SD", "IQR", "Min", "Max", "Skewness", "Kurtosis", "n", "n_Missing" ) ) }) test_that("(multiple) centralities with CIs", { data(iris) x <- iris$Sepal.Width set.seed(123456) expect_message( { out <- describe_distribution( x, centrality = "all", ci = 0.95, iterations = 100 ) }, regex = "For more stable intervals" ) expect_named( out, c( "Median", "MAD", "Mean", "SD", "MAP", "IQR", "CI_low_mean", "CI_high_mean", "CI_low_median", "CI_high_median", "CI_low_MAP", "CI_high_MAP", "Min", "Max", "Skewness", "Kurtosis", "n", "n_Missing" ) ) expect_snapshot(print(out, table_width = Inf)) expect_silent(describe_distribution( x, centrality = "all", ci = 0.95, iterations = 100, verbose = FALSE )) set.seed(123456) out <- describe_distribution( x, centrality = "mean", ci = 0.95, iterations = 100, verbose = FALSE ) expect_named( out, c( "Mean", "SD", "IQR", "CI_low_mean", "CI_high_mean", "Min", "Max", "Skewness", "Kurtosis", "n", "n_Missing" ) ) expect_snapshot(print(out, table_width = Inf)) set.seed(123456) out <- describe_distribution( x, centrality = c("MAP", "median"), ci = 0.95, iterations = 100, verbose = FALSE ) expect_named( out, c( "Median", "MAD", "MAP", "IQR", "CI_low_MAP", "CI_high_MAP", "CI_low_median", "CI_high_median", "Min", "Max", "Skewness", "Kurtosis", "n", "n_Missing" ) ) expect_snapshot(print(out, table_width = Inf)) # only one message for data frame expect_silent(expect_message(describe_distribution(iris, ci = 0.95))) }) test_that("display() method exports to markdown", { skip_if_not_installed("knitr") data(iris) out <- describe_distribution(iris) expect_error(display(out, format = "invalid"), regex = "Invalid option") expect_snapshot(display(out)) }) test_that("display() method exports to tinytable", { skip_if_not_installed("tinytable") data(iris) out <- describe_distribution(iris) expect_snapshot(display(out, format = "tt")) }) ================================================ FILE: tests/testthat/test-distributions.R ================================================ test_that("distributions", { skip_if_not_installed("bayestestR") skip_if_not_installed("parameters") set.seed(123) x <- bayestestR::distribution_normal(100) expect_equal(kurtosis(x)$Kurtosis, -0.1119534, tolerance = 0.01) expect_equal(skewness(x)$Skewness, -5.881466e-17, tolerance = 0.01) expect_equal(as.numeric(smoothness(x, "diff")), 1.183699, tolerance = 0.01) expect_equal(as.numeric(smoothness(x, "cor")), 0.9979799, tolerance = 0.01) }) ================================================ FILE: tests/testthat/test-empty-dataframe.R ================================================ test_that("remove empty with character", { tmp <- data.frame( a = c(1, 2, 3, NA, 5), b = c(1, NA, 3, NA, 5), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5) ) expect_identical(empty_columns(tmp), c(c = 3L)) expect_identical(empty_rows(tmp), 4L) expect_identical(dim(remove_empty_columns(tmp)), c(5L, 3L)) expect_identical(dim(remove_empty_rows(tmp)), c(4L, 4L)) expect_identical(dim(remove_empty(tmp)), c(4L, 3L)) expect_snapshot(remove_empty_columns(tmp)) expect_snapshot(remove_empty_rows(tmp)) expect_snapshot(remove_empty(tmp)) }) test_that("remove empty columns with character", { tmp <- data.frame( a = c(1, 2, 3, NA, 5), b = c("", NA, "", NA, ""), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5), e = c("", "", "", "", ""), stringsAsFactors = FALSE ) expect_identical(empty_columns(tmp), c(b = 2L, c = 3L, e = 5L)) expect_identical(dim(remove_empty_columns(tmp)), c(5L, 2L)) expect_identical(dim(remove_empty(tmp)), c(4L, 2L)) }) test_that("remove empty rows with character", { tmp <- data.frame( a = c(1, "", 3, NA, 5), b = c("", NA, "", NA, ""), c = c(NA, NA, NA, NA, NA), d = c(1, NA, 3, NA, 5), e = c("", "", "", "", ""), f = factor(c("", "", "", "", "")), g = factor(c("", NA, "", NA, "")), stringsAsFactors = FALSE ) expect_identical(empty_rows(tmp), c(2L, 4L)) expect_identical(dim(remove_empty_rows(tmp)), c(3L, 7L)) expect_identical(dim(remove_empty(tmp)), c(3L, 2L)) }) test_that("empty_columns with only NA characters", { tmp <- data.frame( var1 = c(1, 1, 1), var2 = c(NA_character_, NA_character_, NA_character_) ) expect_identical(empty_columns(tmp), c(var2 = 2L)) }) test_that("works with non-ascii chars", { tmp <- data.frame( a = c(1, 2, 3, NA, 5), b = c("", NA, "", NA, ""), c = c(NA, NA, NA, NA, NA), d = c("test", "Se\x96ora", "works fine", "this too", "yeah"), e = c("", "", "", "", ""), stringsAsFactors = FALSE ) expect_identical(empty_columns(tmp), c(b = 2L, c = 3L, e = 5L)) }) ================================================ FILE: tests/testthat/test-extract_column_names.R ================================================ test_that("extract_column_names works as expected", { expect_identical( extract_column_names(iris, starts_with("Sepal")), c("Sepal.Length", "Sepal.Width") ) expect_identical( extract_column_names(iris, starts_with("Sepal", "Petal")), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_identical( extract_column_names(iris, ends_with("Width")), c("Sepal.Width", "Petal.Width") ) expect_identical( extract_column_names(iris, ends_with("Length", "Width")), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_identical( extract_column_names(iris, regex("\\.")), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_identical( extract_column_names(iris, c("Petal.Width", "Sepal.Length")), c("Petal.Width", "Sepal.Length") ) expect_identical( extract_column_names(iris, contains("Wid")), c("Sepal.Width", "Petal.Width") ) expect_identical( extract_column_names(iris, contains("en", "idt")), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_identical( extract_column_names(mtcars, c("am", "gear", "cyl")), c("am", "gear", "cyl") ) expect_identical( extract_column_names(mtcars, c("vam", "gear", "cyl")), c("gear", "cyl") ) expect_warning(expect_null(extract_column_names(mtcars, ends_with("abc")))) expect_identical( extract_column_names(mtcars, regex("rb$")), "carb" ) expect_identical( extract_column_names(mtcars, regex("^c")), c("cyl", "carb") ) expect_warning(expect_null(extract_column_names(mtcars, "^c"))) expect_identical( extract_column_names(mtcars, regex("^C"), ignore_case = TRUE), c("cyl", "carb") ) expect_identical( extract_column_names(iris, "Width$", regex = TRUE), c("Sepal.Width", "Petal.Width") ) }) test_that("extract_column_names from other functions", { test_fun1 <- function(data, i) { extract_column_names(data, select = i) } expect_identical( test_fun1(iris, c("Sepal.Length", "Sepal.Width")), c("Sepal.Length", "Sepal.Width") ) expect_identical( test_fun1(iris, starts_with("Sep")), c("Sepal.Length", "Sepal.Width") ) test_fun1a <- function(data, i) { extract_column_names(data, select = i, regex = TRUE) } expect_identical( test_fun1a(iris, "Sep"), c("Sepal.Length", "Sepal.Width") ) test_fun1b <- function(data, i) { extract_column_names(data, select = i, regex = TRUE) } expect_identical( test_fun1b(iris, "Width$"), c("Sepal.Width", "Petal.Width") ) test_fun2 <- function(data) { extract_column_names(data, select = starts_with("Sep")) } expect_identical( test_fun2(iris), c("Sepal.Length", "Sepal.Width") ) test_fun3 <- function(data) { i <- "Sep" extract_column_names(data, select = starts_with(i)) } expect_identical( test_fun3(iris), c("Sepal.Length", "Sepal.Width") ) }) test_that("extract_column_names regex", { expect_identical( extract_column_names(mtcars, select = "pg", regex = TRUE), extract_column_names(mtcars, select = "mpg") ) }) test_that("extract_column_names works correctly with minus sign", { expect_identical( extract_column_names(iris, -"Sepal.Length"), c("Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_identical( extract_column_names(iris, -c("Sepal.Length", "Petal.Width")), c("Sepal.Width", "Petal.Length", "Species") ) expect_identical( extract_column_names(iris, -1), c("Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) expect_error( extract_column_names(iris, -1:2), regexp = "can't mix negative" ) expect_identical( extract_column_names(iris, -(1:2)), c("Petal.Length", "Petal.Width", "Species") ) expect_identical( extract_column_names(iris, -c(1, 3)), c("Sepal.Width", "Petal.Width", "Species") ) expect_identical( extract_column_names(iris, -starts_with("Sepal", "Petal")), "Species" ) expect_identical( extract_column_names(iris, -ends_with("Length", "Width")), "Species" ) expect_identical( extract_column_names(iris, -contains("en", "idt")), "Species" ) expect_identical( extract_column_names( iris, -c("Sepal.Length", "Petal.Width"), exclude = "Species" ), c("Sepal.Width", "Petal.Length") ) }) test_that("extract_column_names with square brackets", { expect_identical( extract_column_names(mtcars, select = names(mtcars)[-1]), extract_column_names(mtcars, select = 2:11) ) }) ================================================ FILE: tests/testthat/test-labelled_data.R ================================================ data(efc, package = "datawizard") # reverse ----------------------------------- test_that("reverse, labels preserved", { # factor, label expect_identical( attr(reverse(efc$e42dep), "label", exact = TRUE), "elder's dependency" ) # factor, labels expect_named( attr(reverse(efc$e42dep), "labels", exact = TRUE), names(attr(efc$e42dep, "labels", exact = TRUE)) ) expect_equal( attr(reverse(efc$e42dep), "labels", exact = TRUE), rev(attr(efc$e42dep, "labels", exact = TRUE)), ignore_attr = TRUE ) # numeric expect_named( attr(reverse(efc$c12hour), "labels", exact = TRUE), names(attr(efc$c12hour, "labels", exact = TRUE)) ) # data frame labels <- sapply(reverse(efc), attr, which = "label", exact = TRUE) expect_identical( labels, c( c12hour = "average number of hours of care per week", e16sex = "elder's gender", e42dep = "elder's dependency", c172code = "carer's level of education", neg_c_7 = "Negative impact with 7 items" ) ) }) # data_merge ----------------------------------- test_that("data_merge, labels preserved", { labels <- sapply( data_merge(efc[1:2], efc[3:4], verbose = FALSE), attr, which = "label", exact = TRUE ) expect_identical( labels, c( c12hour = "average number of hours of care per week", e16sex = "elder's gender", e42dep = "elder's dependency", c172code = "carer's level of education" ) ) }) # data_extract ----------------------------------- test_that("data_extract, labels preserved", { # factor expect_equal( attr(data_extract(efc, select = "e42dep"), "labels", exact = TRUE), attr(efc$e42dep, "labels", exact = TRUE), ignore_attr = TRUE ) # numeric expect_equal( attr(data_extract(efc, select = "c172code"), "labels", exact = TRUE), attr(efc$c172code, "labels", exact = TRUE), ignore_attr = TRUE ) # data frame labels <- sapply( data_extract(efc, select = c("e42dep", "c172code")), attr, which = "label", exact = TRUE ) expect_identical( labels, c(e42dep = "elder's dependency", c172code = "carer's level of education") ) }) # categorize ----------------------------------- test_that("categorize, labels preserved", { # factor expect_equal( attr(categorize(efc$e42dep), "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric expect_equal( attr(categorize(efc$c12hour), "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) }) # data_reorder ----------------------------------- test_that("data_reorder, labels preserved", { expect_equal( attr(data_reorder(efc, "e42dep")[[1]], "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) }) # data_remove ----------------------------------- test_that("data_remove, labels preserved", { expect_equal( attr(data_remove(efc, "e42dep")[[1]], "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) }) # data_rename ----------------------------------- test_that("data_rename, labels preserved", { # factor x <- data_rename(efc, "e42dep", "dependency") expect_equal( attr(x$dependency, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric x <- data_rename(efc, "c12hour", "careload") expect_equal( attr(x$careload, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) # data frame labels <- sapply( data_remove(efc, starts_with("c1")), attr, which = "label", exact = TRUE ) expect_identical( labels, c( e16sex = "elder's gender", e42dep = "elder's dependency", neg_c_7 = "Negative impact with 7 items" ) ) }) # data_addprefix ----------------------------------- test_that("data_addprefix, labels preserved", { x <- data_addprefix(efc, "new_") # factor expect_equal( attr(x$new_e42dep, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric expect_equal( attr(x$new_c12hour, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) }) # data_suffix ----------------------------------- test_that("data_addsuffix, labels preserved", { x <- data_addsuffix(efc, "_new") # factor expect_equal( attr(x$e42dep_new, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric expect_equal( attr(x$c12hour_new, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) }) # to_numeric ----------------------------------- test_that("to_numeric, labels preserved", { x <- to_numeric(efc, dummy_factors = FALSE) # factor expect_equal( attr(x$e42dep, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric expect_equal( attr(x$c12hour, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) x <- to_numeric(efc, dummy_factors = TRUE) # numeric expect_equal( attr(x$c12hour, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) }) # data_match ----------------------------------- test_that("data_match, labels preserved", { x <- data_match(efc, data.frame(c172code = 1, e16sex = 2), match = "or") # factor expect_equal( attr(x$e42dep, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric expect_equal( attr(x$c12hour, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) # filtered expect_equal( attr(x$c172code, "label", exact = TRUE), attr(efc$c172code, "label", exact = TRUE), ignore_attr = TRUE ) expect_equal( attr(x$c172code, "labels", exact = TRUE), attr(efc$c172code, "labels", exact = TRUE), ignore_attr = TRUE ) }) # data_filter ----------------------------------- test_that("data_filter, labels preserved", { x <- data_filter(efc, c172code == 1 & c12hour > 40) # factor expect_identical( attr(x$e42dep, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE) ) # numeric expect_identical( attr(x$c12hour, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE) ) }) # convert_to_na ----------------------------------- test_that("convert_to_na, labels preserved", { expect_message({ x <- convert_to_na(efc, na = c(2, "2"), select = starts_with("e")) }) # factor expect_equal( attr(x$e42dep, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric expect_equal( attr(x$e16sex, "label", exact = TRUE), attr(efc$e16sex, "label", exact = TRUE), ignore_attr = TRUE ) # factor x <- convert_to_na(efc$e42dep, na = "2") expect_equal( attr(x, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) # numeric x <- convert_to_na(efc$e16sex, na = 2) expect_equal( attr(x, "label", exact = TRUE), attr(efc$e16sex, "label", exact = TRUE), ignore_attr = TRUE ) # drop unused value labels x <- convert_to_na(efc$c172code, na = 2) expect_identical( attr(x, "labels", exact = TRUE), c(`low level of education` = 1, `high level of education` = 3) ) }) # data_select ----------------------------------- test_that("data_select, labels preserved", { x <- data_select(efc, starts_with("c")) # numeric expect_equal( attr(x$c12hour, "label", exact = TRUE), attr(efc$c12hour, "label", exact = TRUE), ignore_attr = TRUE ) x <- data_select(efc, starts_with("e")) # factor expect_equal( attr(x$e42dep, "label", exact = TRUE), attr(efc$e42dep, "label", exact = TRUE), ignore_attr = TRUE ) }) # recode_values ----------------------------------- test_that("recode_values, labels preserved", { options(data_recode_pattern = NULL) data(efc) x <- recode_values(efc$c172code, recode = list(`0` = 1:2, `1` = 3)) expect_equal( attr(x, "label", exact = TRUE), attr(efc$c172code, "label", exact = TRUE), ignore_attr = TRUE ) expect_null(attr(x, "labels", exact = TRUE)) }) # slide ----------------------------------- test_that("slide, labels preserved", { data(efc) suppressMessages({ x <- slide(efc) }) expect_equal( attr(x$c172code, "label", exact = TRUE), attr(efc$c172code, "label", exact = TRUE), ignore_attr = TRUE ) expect_null(attr(x$c172code, "labels", exact = TRUE)) x <- slide(efc$c172code) expect_equal( attr(x, "label", exact = TRUE), attr(efc$c172code, "label", exact = TRUE), ignore_attr = TRUE ) }) # to_factor ----------------------------------- test_that("to_factor, labels preserved", { data(efc) x <- to_factor(efc) expect_equal( attr(x$c172code, "label", exact = TRUE), attr(efc$c172code, "label", exact = TRUE), ignore_attr = TRUE ) expect_null(attr(x$c172code, "labels", exact = TRUE)) x <- to_factor(efc$c172code) expect_equal( attr(x, "label", exact = TRUE), attr(efc$c172code, "label", exact = TRUE), ignore_attr = TRUE ) }) ================================================ FILE: tests/testthat/test-labels_to_levels.R ================================================ test_that("labels_to_levels, numeric", { expect_message( labels_to_levels(1:10), regex = "only works" ) }) test_that("labels_to_levels, factor", { data(efc) x <- as.factor(efc$c172code) attr(x, "labels") <- c(low = 1, mid = 2, high = 3) x <- labels_to_levels(x) expect_identical(levels(x), c("low", "mid", "high")) expect_equal(table(x), table(efc$c172code), ignore_attr = TRUE) x <- as.ordered(efc$c172code) attr(x, "labels") <- c(low = 1, mid = 2, high = 3) x <- labels_to_levels(x) expect_identical(levels(x), c("low", "mid", "high")) expect_s3_class(x, "ordered") }) test_that("labels_to_levels, factor, error on no labels", { data(efc) data(iris) x <- as.factor(efc$c172code) expect_error(labels_to_levels(x), regex = "Could not change factor") expect_error(labels_to_levels(iris), regex = "Could not change factor") }) test_that("labels_to_levels, data frame, append", { data(efc) out <- labels_to_levels(efc, append = "_ll") expect_named( out, c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_ll") ) }) test_that("labels_to_levels, data frame, append", { data(iris) d <- as.data.frame(lapply(iris, as.factor)) expect_identical(labels_to_levels(d), d) }) test_that("labels_to_levels, factor, data frame", { data(efc) out <- labels_to_levels(efc) expect_identical( levels(out$e42dep), c( "independent", "slightly dependent", "moderately dependent", "severely dependent" ) ) expect_identical(sum(vapply(efc, is.factor, TRUE)), 1L) }) test_that("labels_to_levels, factor, with random value numbers (no sequential order)", { x <- c(5, 5, 1, 3, 1, 7) attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) out <- to_factor(x, labels_to_levels = TRUE) expect_identical( as.character(out), c("don't know", "don't know", "yes", "maybe", "yes", "no") ) expect_identical(levels(out), c("yes", "maybe", "don't know", "no")) x <- c(4, 4, 1, 2, 1, 3) attr(x, "labels") <- c(a = 1, b = 2, c = 3, d = 4) out <- to_factor(x, labels_to_levels = TRUE) expect_identical(as.character(out), c("d", "d", "a", "b", "a", "c")) expect_identical(levels(out), c("a", "b", "c", "d")) x <- c(4, 4, 1, 2, 1, 3) attr(x, "labels") <- c(d = 1, c = 2, b = 3, a = 4) out <- to_factor(x, labels_to_levels = TRUE) expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b")) expect_identical(levels(out), c("d", "c", "b", "a")) x <- c(5, 5, 1, 3, 1, 7) attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) out <- to_factor(x, labels_to_levels = TRUE) expect_identical( out, structure( c(3L, 3L, 1L, 2L, 1L, 4L), levels = c("yes", "maybe", "don't know", "no"), class = "factor" ) ) expect_identical( as.character(out), c("don't know", "don't know", "yes", "maybe", "yes", "no") ) x <- c(5, 5, 1, 3, 1, 7, 4) attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) expect_message( { out <- to_factor(x, labels_to_levels = TRUE) }, regex = "Not all factor levels" ) expect_identical( out, structure( c(4L, 4L, 1L, 2L, 1L, 5L, 3L), levels = c("yes", "maybe", "4", "don't know", "no"), class = "factor" ) ) expect_identical( as.character(out), c("don't know", "don't know", "yes", "maybe", "yes", "no", "4") ) x <- c(5, 5, 1, 3, 1, 7) attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5) expect_message({ out <- to_factor(x, labels_to_levels = TRUE) }) expect_identical( out, structure( c(3L, 3L, 1L, 2L, 1L, 4L), levels = c("yes", "3", "don't know", "no"), class = "factor" ) ) expect_identical( as.character(out), c("don't know", "don't know", "yes", "3", "yes", "no") ) x <- c(5, 5, 1, 3, 1, 7, 6) attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5) expect_message({ out <- to_factor(x, labels_to_levels = TRUE) }) expect_identical( out, structure( c(3L, 3L, 1L, 2L, 1L, 5L, 4L), levels = c("yes", "3", "don't know", "6", "no"), class = "factor" ) ) expect_identical( as.character(out), c("don't know", "don't know", "yes", "3", "yes", "no", "6") ) }) ================================================ FILE: tests/testthat/test-makepredictcall.R ================================================ test_that("makepredictcall", { data("mtcars") nd <- data.frame(hp = c(100, 200)) m1 <- lm(mpg ~ scale(hp, scale = FALSE), mtcars) m2 <- lm(mpg ~ center(hp), mtcars) m3 <- lm(mpg ~ scale(hp), mtcars) m4 <- lm(mpg ~ standardize(hp), mtcars) p1 <- predict(m1, nd) expect_equal(p1, predict(m2, nd)) expect_equal(p1, predict(m3, nd)) expect_equal(p1, predict(m4, nd)) X <- matrix(rnorm(100), ncol = 2) Y <- rnorm(50) expect_error(lm(Y ~ standardize(X)), "matrices") }) test_that("makepredictcall, normalize", { data("mtcars") m1 <- lm(mpg ~ normalize(hp), data = mtcars) m2 <- lm(mpg ~ hp, data = mtcars) m3 <- lm(mpg ~ normalize(hp, include_bounds = FALSE), data = mtcars) out1 <- predict(m1, newdata = data.frame(hp = c(100, 110, 120))) out2 <- predict(m2, newdata = data.frame(hp = c(100, 110, 120))) out3 <- predict(m3, newdata = data.frame(hp = c(100, 110, 120))) expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE) out1 <- predict(m1, newdata = data.frame(hp = 110)) out2 <- predict(m2, newdata = data.frame(hp = 110)) out3 <- predict(m3, newdata = data.frame(hp = 110)) expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE) }) test_that("makepredictcall, rescale", { data("mtcars") m1 <- lm(mpg ~ rescale(hp, to = c(50, 80)), data = mtcars) m2 <- lm(mpg ~ hp, data = mtcars) m3 <- lm(mpg ~ rescale(hp), data = mtcars) out1 <- predict(m1, newdata = data.frame(hp = c(100, 110, 120))) out2 <- predict(m2, newdata = data.frame(hp = c(100, 110, 120))) out3 <- predict(m3, newdata = data.frame(hp = c(100, 110, 120))) expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE) out1 <- predict(m1, newdata = data.frame(hp = 110)) out2 <- predict(m2, newdata = data.frame(hp = 110)) out3 <- predict(m3, newdata = data.frame(hp = 110)) expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE) expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE) }) ================================================ FILE: tests/testthat/test-mean_sd.R ================================================ test_that("mean_sd", { x <- c(-1, 0, 1) msd <- mean_sd(x) expect_identical(unname(msd), x) expect_named(msd, c("-SD", "Mean", "+SD")) msd <- mean_sd(mtcars[["mpg"]]) mmad <- median_mad(mtcars[["mpg"]]) expect_identical( unname(msd), mean(mtcars[["mpg"]]) + c(-1, 0, 1) * sd(mtcars[["mpg"]]) ) expect_identical( unname(mmad), median(mtcars[["mpg"]]) + c(-1, 0, 1) * mad(mtcars[["mpg"]]) ) msd2 <- mean_sd(mtcars[["mpg"]], times = 3L) expect_length(msd2, n = 3 * 2 + 1) expect_identical(unname(msd2[3:5]), unname(msd)) expect_equal( unname(diff(msd2)), rep(sd(mtcars[["mpg"]]), 6), tolerance = 0.00001 ) expect_named( msd2, c("-3 SD", "-2 SD", "-1 SD", "Mean", "+1 SD", "+2 SD", "+3 SD") ) }) ================================================ FILE: tests/testthat/test-means_by_group.R ================================================ test_that("mean_by_group", { skip_if_not_installed("emmeans") data(efc) expect_snapshot(means_by_group(efc, "c12hour", "e42dep")) expect_snapshot(means_by_group(efc, "c12hour", "e42dep", ci = 0.99)) expect_snapshot(means_by_group(efc, "c12hour", "e42dep", ci = NA)) expect_snapshot(means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep")) expect_snapshot(means_by_group( efc, c("neg_c_7", "c12hour"), "e42dep", ci = NA )) expect_snapshot(means_by_group( efc, c("neg_c_7", "c12hour"), "e42dep", ci = 0.99 )) expect_snapshot(means_by_group(efc$c12hour, efc$e42dep)) expect_snapshot(means_by_group(efc$c12hour, efc$e42dep, ci = NA)) }) test_that("mean_by_group, weighted", { skip_if_not_installed("emmeans") data(efc) set.seed(123) efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) expect_snapshot( means_by_group(efc, "c12hour", "e42dep", weights = "weight"), variant = "windows" ) expect_snapshot( means_by_group(efc, "c12hour", "e42dep", weights = "weight", ci = NA), variant = "windows" ) }) ================================================ FILE: tests/testthat/test-normalize.R ================================================ test_that("normalize work as expected", { expect_equal( normalize(c(0, 1, 5, -5, -2)), c(0.5, 0.6, 1, 0, 0.3), ignore_attr = TRUE ) expect_equal( normalize(c(0, 1, 5, -5, -2), include_bounds = FALSE), c(0.5, 0.58, 0.9, 0.1, 0.34), ignore_attr = TRUE ) expect_equal( normalize(c(0, 1, 5, -5, -2), include_bounds = 0.01), c(0.5, 0.598, 0.99, 0.01, 0.304), ignore_attr = TRUE, tolerance = 1e-4 ) expect_equal( normalize(c(0, 1, 5, -5, -2), include_bounds = "a", verbose = FALSE), c(0.5, 0.6, 1, 0, 0.3), ignore_attr = TRUE, tolerance = 1e-4 ) expect_warning(normalize( c(0, 1, 5, -5, -2), include_bounds = "a", verbose = TRUE )) expect_snapshot(head(normalize(trees))) }) test_that("normalize: only NAs", { expect_equal( normalize(c(NA_real_, NA_real_)), c(NA_real_, NA_real_), ignore_attr = TRUE ) }) test_that("normalize: with Inf", { expect_equal( normalize(c(1, 2, 3, NA, Inf)), c(0, 0.5, 1, NA, Inf), ignore_attr = TRUE ) }) test_that("normalize: with Inf", { expect_equal( normalize(c(1, 2, 3, -Inf, Inf)), c(0, 0.5, 1, -Inf, Inf), ignore_attr = TRUE ) }) test_that("normalize: all Inf", { expect_equal( normalize(c(-Inf, Inf)), c(-Inf, Inf), ignore_attr = TRUE ) }) test_that("normalize: all Na or Inf", { expect_equal( normalize(c(NA, -Inf, NA, Inf)), c(NA, -Inf, NA, Inf), ignore_attr = TRUE ) }) test_that("normalize: only one value", { foo <- 1 expect_warning( normalize(x = foo), regexp = "Variable `foo` contains only one unique value and will" ) expect_warning( { y <- normalize(x = 12) }, regexp = "Variable `12` contains only one unique value and will" ) expect_equal(y, 12, ignore_attr = TRUE) expect_silent(normalize(x = foo, verbose = FALSE)) expect_equal(normalize(x = foo, verbose = FALSE), 1, ignore_attr = TRUE) }) test_that("normalize: only two values", { expect_warning({ y <- normalize(x = c(1, 2)) }) expect_equal(y, c(0, 1), ignore_attr = TRUE) expect_silent(normalize(x = c(1, 2), verbose = FALSE)) expect_equal( normalize(x = c(1, 2), verbose = FALSE), c(0, 1), ignore_attr = TRUE ) }) test_that("normalize: factor", { expect_identical( normalize(factor(1:3)), factor(1:3) ) }) test_that("normalize: matrix", { expect_equal( normalize(matrix(1:4, ncol = 2)), matrix(seq(0, 1, by = 0.3333), ncol = 2), tolerance = 1e-3 ) }) test_that("normalize: select", { skip_if_not_installed("poorman") expect_equal( normalize( iris, select = starts_with("Petal\\.L") ) %>% poorman::pull(Petal.Length), normalize(iris$Petal.Length), ignore_attr = TRUE ) }) test_that("normalize: exclude", { skip_if_not_installed("poorman") expect_identical( normalize( iris, exclude = ends_with("ecies") ), iris %>% normalize(select = 1:4) ) }) test_that("normalize, with append", { out_n <- normalize(iris, "Sepal.Width", append = TRUE) manual <- (iris$Sepal.Width - min(iris$Sepal.Width)) / diff(range(iris$Sepal.Width)) expect_equal(out_n$Sepal.Width_n, manual, ignore_attr = TRUE) }) # with grouped data ------------------------------------------- test_that("normalize (grouped data)", { skip_if_not_installed("poorman") datawizard <- iris %>% poorman::group_by(Species) %>% normalize(Sepal.Width) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) manual <- iris %>% poorman::group_by(Species) %>% poorman::mutate( Sepal.Width = (Sepal.Width - min(Sepal.Width)) / diff(range(Sepal.Width)) ) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) expect_identical(datawizard, manual) }) test_that("normalize (grouped data), with append", { skip_if_not_installed("poorman") datawizard_n <- iris %>% poorman::group_by(Species) %>% normalize(Sepal.Width, append = TRUE) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width_n) manual_n <- iris %>% poorman::group_by(Species) %>% poorman::mutate( Sepal.Width = (Sepal.Width - min(Sepal.Width)) / diff(range(Sepal.Width)) ) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) expect_identical(datawizard_n, manual_n) }) test_that("normalize, include bounds (grouped data)", { skip_if_not_installed("poorman") datawizard <- iris %>% poorman::group_by(Species) %>% normalize(Sepal.Width, include_bounds = TRUE) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) manual <- iris %>% poorman::group_by(Species) %>% poorman::mutate( Sepal.Width = (Sepal.Width - min(Sepal.Width)) / diff(range(Sepal.Width)) ) %>% poorman::ungroup() %>% poorman::pull(Sepal.Width) expect_identical(datawizard, manual) }) test_that("normalize, factor (grouped data)", { skip_if_not_installed("poorman") datawizard <- iris %>% poorman::group_by(Species) %>% normalize(Species) %>% poorman::ungroup() %>% poorman::pull(Species) manual <- iris$Species expect_identical(datawizard, manual) }) # select helpers ------------------------------ test_that("normalize regex", { expect_identical( normalize(mtcars, select = "pg", regex = TRUE), normalize(mtcars, select = "mpg") ) }) ================================================ FILE: tests/testthat/test-print.dw_transformer.R ================================================ test_that("print.dw_transformer", { data(iris) expect_snapshot(rescale(iris$Sepal.Length)) expect_snapshot(normalize(iris$Sepal.Length)) expect_snapshot(center(iris$Sepal.Length)) expect_snapshot(standardize(iris$Sepal.Length)) }) ================================================ FILE: tests/testthat/test-ranktransform.R ================================================ test_that("ranktransform works with NAs", { x <- c(NA_real_, NA_real_) expect_identical(ranktransform(x), x) }) test_that("ranktransform works with factors", { x <- factor(c("apple", "bear", "banana", "dear")) expect_identical(ranktransform(x), x) }) test_that("ranktransform works with unique value vectors", { x <- c(1L, 1L, 1L) expect_identical(suppressWarnings(ranktransform(x)), x) expect_warning( ranktransform(x), "Variable `x` contains only one unique value and will not be normalized." ) }) test_that("ranktransform works with two unique value vectors", { x <- c(1L, 1L, 1L, 2L, 2L, 2L) expect_identical(suppressWarnings(ranktransform(x)), c(2, 2, 2, 5, 5, 5)) expect_warning( ranktransform(x), "Consider converting it" ) }) test_that("signed rank works as expected", { x <- c(-1, 2, -3, 4) sr <- ranktransform(x, sign = TRUE) r <- ranktransform(x, sign = FALSE) expect_identical(sr, x) # unchanged expect_identical(r, c(2, 3, 1, 4)) x <- c(1, -2, -2, 4, 0, 3, -14, 0) expect_warning(ranktransform(x, sign = TRUE)) expect_true(all(is.na(suppressWarnings( ranktransform(x, sign = TRUE)[c(5, 8)] )))) }) test_that("argument 'zeros' works", { x <- c(-1, 0, 2, -3, 4) expect_warning( ranktransform(x, sign = TRUE), "cannot be sign-rank" ) expect_identical( ranktransform(x, sign = TRUE, zeros = "signrank"), c(-2, 0, 3, -4, 5) ) expect_error( ranktransform(x, sign = TRUE, zeros = "foo"), "should be one of" ) }) test_that("ranktransform works with data frames", { set.seed(123) expect_snapshot(ranktransform(BOD)) }) # with grouped data ------------------------------------------- test_that("ranktransform works with data frames (grouped data)", { skip_if_not_installed("poorman") set.seed(123) value1 <- sample.int(20, 9, replace = TRUE) set.seed(456) value2 <- sample.int(20, 9, replace = TRUE) test_df <- data.frame( id = rep(c("A", "B", "C"), each = 3), value1 = value1, value2 = value2, stringsAsFactors = FALSE ) # nolint start: nested_pipe_linter expect_identical( test_df %>% poorman::group_by(id) %>% ranktransform(exclude = "id") %>% poorman::ungroup(), data.frame( id = rep(c("A", "B", "C"), each = 3), value1 = c(2, 3, 1, 1, 2, 3, 2, 1, 3), value2 = c(3, 2, 1, 1, 3, 2, 2, 3, 1), stringsAsFactors = FALSE ) ) # nolint end }) test_that("ranktransform works with data frames containing NAs (grouped data)", { skip_if_not_installed("poorman") set.seed(789) value1 <- sample(c(1:15, NA), 9, replace = TRUE) set.seed(10) value2 <- sample(c(1:15, NA), 9, replace = TRUE) test_df <- data.frame( id = rep(c("A", "B", "C"), each = 3), value1 = value1, value2 = value2, stringsAsFactors = FALSE ) # nolint start: nested_pipe_linter expect_identical( test_df %>% poorman::group_by(id) %>% ranktransform(exclude = "id") %>% poorman::ungroup(), data.frame( id = rep(c("A", "B", "C"), each = 3), value1 = c(2, NA, 1, 1, 3, 2, 2, NA, 1), value2 = c(3, 1, 2, NA, 2, 1, 3, 1, 2), stringsAsFactors = FALSE ) ) # nolint end }) # select helpers ------------------------------ test_that("ranktransform regex", { expect_identical( ranktransform(mtcars, select = "pg", regex = TRUE), ranktransform(mtcars, select = "mpg") ) }) ================================================ FILE: tests/testthat/test-recode_into.R ================================================ test_that("recode_into", { x <- 1:10 out <- recode_into( x > 5 ~ "a", x > 2 & x <= 5 ~ "b", default = "c" ) expect_identical(out, c("c", "c", "b", "b", "b", "a", "a", "a", "a", "a")) }) test_that("recode_into, overwrite", { x <- 1:30 expect_warning( recode_into( x > 1 ~ "a", x > 10 & x <= 15 ~ "b", default = "c", overwrite = TRUE ), regex = "overwritten" ) # validate results x <- 1:10 expect_silent({ out <- recode_into( x >= 3 & x <= 7 ~ 1, x > 5 ~ 2, default = 0, verbose = FALSE ) }) expect_identical(out, c(0, 0, 1, 1, 1, 2, 2, 2, 2, 2)) expect_warning( recode_into( x >= 3 & x <= 7 ~ 1, x > 5 ~ 2, default = 0 ), regex = "case 6" ) x <- 1:10 expect_silent({ out <- recode_into( x >= 3 & x <= 7 ~ 1, x > 5 ~ 2, default = 0, overwrite = FALSE, verbose = FALSE ) }) expect_identical(out, c(0, 0, 1, 1, 1, 1, 1, 2, 2, 2)) expect_warning( recode_into( x >= 3 & x <= 7 ~ 1, x > 5 ~ 2, default = 0, overwrite = FALSE ), regex = "case 6" ) }) test_that("recode_into, don't overwrite", { x <- 1:30 expect_warning( recode_into( x > 1 ~ "a", x > 10 & x <= 15 ~ "b", default = "c", overwrite = FALSE ), regex = "altered" ) }) test_that("recode_into, check mixed types", { x <- 1:10 expect_error( { out <- recode_into( x > 5 ~ 1, x > 2 & x <= 5 ~ "b" ) }, regexp = "Recoding not carried out" ) }) test_that("recode_into, complain about default = NULL", { x <- 1:10 expect_warning( { out <- recode_into( x > 5 ~ "c", x > 2 & x <= 5 ~ "b", default = NULL ) }, regexp = "Default value" ) expect_identical(out, c(NA, NA, "b", "b", "b", "c", "c", "c", "c", "c")) }) test_that("recode_into, data frame", { data(mtcars) out <- recode_into( mtcars$mpg > 20 & mtcars$cyl == 6 ~ 1, mtcars$mpg <= 20 ~ 2, default = 0 ) expect_identical( out, c( 1, 1, 0, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 ) ) d <- mtcars out <- recode_into( mpg > 20 & cyl == 6 ~ 1, mpg <= 20 ~ 2, default = 0, data = d ) expect_identical( out, c( 1, 1, 0, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 ) ) }) test_that("recode_into, works inside functions", { test <- function() { set.seed(123) d <- data.frame( x = sample.int(5, 30, TRUE), y = sample(letters[1:5], 30, TRUE), stringsAsFactors = FALSE ) recode_into( x %in% 1:3 & y %in% c("a", "b") ~ 1, x > 3 ~ 2, data = d, default = 0 ) } expect_identical( test(), c( 1, 1, 1, 0, 0, 2, 2, 0, 1, 1, 2, 0, 0, 0, 2, 1, 1, 2, 1, 0, 1, 1, 0, 2, 0, 1, 2, 2, 1, 2 ) ) }) test_that("recode_into, check differen input length", { x <- 1:10 y <- 10:30 expect_error( { out <- recode_into( x > 5 ~ 1, y > 10 ~ 2 ) }, regexp = "matching conditions" ) }) test_that("recode_into, check different input length", { x <- 1:5 y <- c(5, 2, 3, 1, 4) expect_warning( { out <- recode_into( x == 2 ~ 1, y == 2 & x == 2 ~ 2, default = 0 ) }, regexp = "Several recode patterns" ) }) test_that("recode_into, make sure recode works with missing in original variable", { data(mtcars) mtcars$mpg[c(3, 10, 12, 15, 16)] <- NA mtcars$cyl[c(2, 15, 16)] <- NA d_recode_na <<- as.data.frame(mtcars) out1_recoded_na <- recode_into( d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, d_recode_na$cyl == 4 ~ 3, default = 0, preserve_na = TRUE ) out2_recoded_na <- recode_into( d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, default = 0, preserve_na = TRUE ) expect_message( { out3_recoded_na <- recode_into( d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, d_recode_na$cyl == 4 ~ 3, default = 0, preserve_na = FALSE ) }, regex = "Missing values in original variable" ) expect_message( { out4_recoded_na <- recode_into( d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, default = 0, preserve_na = FALSE ) }, regex = "Missing values in original variable" ) # one NA in mpg is overwritten by valid value from cyl, total 5 NA expect_identical( out1_recoded_na, c( 1, NA, 3, 1, 2, 2, 2, 3, 3, NA, 2, NA, 2, 2, NA, NA, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3 ) ) # total 6 NA expect_identical( out2_recoded_na, c( 1, NA, NA, 1, 2, 2, 2, 0, 0, NA, 2, NA, 2, 2, NA, NA, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 ) ) # NA is preserved, set to default if not overwritten by other recodes expect_identical( out3_recoded_na, c( 1, 0, 3, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 2, 0, 0, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3 ) ) expect_identical( out4_recoded_na, c( 1, 0, 0, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 ) ) }) test_that("recode_into, NA doesn't need to be of exact type", { data(mtcars) x1 <- recode_into( mpg > 10 ~ 1, gear == 5 ~ NA_real_, data = mtcars, verbose = FALSE ) x2 <- recode_into( mpg > 10 ~ 1, gear == 5 ~ NA, data = mtcars, verbose = FALSE ) expect_identical(x1, x2) }) ================================================ FILE: tests/testthat/test-replace_nan_inf.R ================================================ test_that("extract from data frame", { x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7) expect_identical( replace_nan_inf(x), c(1, 2, NA, 3, NA, 4, NA, 5, NA, NA, 6, 7) ) # a data frame df <- data.frame( x = c(1, NA, 5, Inf, 2, NA), y = c(3, NaN, 4, -Inf, 6, 7), stringsAsFactors = FALSE ) expect_identical( replace_nan_inf(df), structure( list( x = c(1, NA, 5, NA, 2, NA), y = c(3, NA, 4, NA, 6, 7) ), row.names = c(NA, -6L), class = "data.frame" ) ) expect_identical( replace_nan_inf(df, select = starts_with("x")), structure( list( x = c(1, NA, 5, NA, 2, NA), y = c(3, NaN, 4, -Inf, 6, 7) ), row.names = c(NA, -6L), class = "data.frame" ) ) }) ================================================ FILE: tests/testthat/test-rescale_weights.R ================================================ test_that("rescale_weights works as expected", { data(nhanes_sample) # convert tibble into data frame, so check-hard GHA works nhanes_sample <- as.data.frame(nhanes_sample) expect_snapshot(head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA"))) expect_snapshot(head(rescale_weights( nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU") ))) expect_snapshot(head(rescale_weights( nhanes_sample, probability_weights = "WTINT2YR", method = "kish" ))) out <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") expect_equal(sum(out$rescaled_weights_a), 2992, tolerance = 1e-3) expect_equal(sum(out$rescaled_weights_b), 2244.71451, tolerance = 1e-3) out <- rescale_weights(nhanes_sample, "WTINT2YR", method = "kish") expect_equal(sum(out$rescaled_weights), 2162.53961, tolerance = 1e-3) out <- rescale_weights( nhanes_sample, "WTINT2YR", by = "SDMVPSU", method = "kish" ) expect_equal(sum(out$rescaled_weights), 2163.3657, tolerance = 1e-3) }) test_that("rescale_weights works as expected", { data(nhanes_sample) # convert tibble into data frame, so check-hard GHA works nhanes_sample <- as.data.frame(nhanes_sample)[1:20, ] # add NAs set.seed(123) nhanes_sample$WTINT2YR[sample.int(nrow(nhanes_sample), 5)] <- NA expect_snapshot(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) expect_snapshot(rescale_weights(nhanes_sample, "WTINT2YR", method = "kish")) }) test_that("rescale_weights nested works as expected", { data(nhanes_sample) # convert tibble into data frame, so check-hard GHA works nhanes_sample <- as.data.frame(nhanes_sample) expect_snapshot( rescale_weights( data = head(nhanes_sample, n = 30), by = c("SDMVSTRA", "SDMVPSU"), probability_weights = "WTINT2YR", nest = TRUE ) ) expect_warning( { x <- rescale_weights( data = head(nhanes_sample), by = "SDMVPSU", probability_weights = "WTINT2YR", nest = TRUE ) }, "Only one group variable selected" ) expect_identical( x, rescale_weights( data = head(nhanes_sample), by = "SDMVPSU", probability_weights = "WTINT2YR" ) ) }) test_that("rescale_weights errors and warnings", { data(nhanes_sample) expect_error( rescale_weights( data = head(nhanes_sample, n = 30), by = c("a", "SDMVSTRA", "c"), probability_weights = "WTINT2YR" ), regex = "The following" ) expect_error( rescale_weights( data = head(nhanes_sample, n = 30), by = "SDMVSTRA", probability_weights = NULL ), regex = "is missing, but required" ) expect_error( rescale_weights( data = head(nhanes_sample, n = 30), by = NULL, probability_weights = "WTINT2YR" ), regex = "must be specified" ) expect_error( rescale_weights( data = head(nhanes_sample, n = 30), by = "abc", probability_weights = "WTINT2YR", method = "kish" ), regex = "The following variable" ) expect_warning( rescale_weights( data = head(nhanes_sample, n = 30), by = "SDMVSTRA", probability_weights = "WTINT2YR", nest = TRUE, method = "kish" ), regex = "is ignored" ) expect_error( rescale_weights( data = head(nhanes_sample, n = 30), probability_weights = "WTINT2YR", method = "dish" ), regex = "Invalid option for argument" ) nhanes_sample$rescaled_weights_a <- 1 expect_warning( { out <- rescale_weights( data = head(nhanes_sample, n = 30), by = "SDMVSTRA", probability_weights = "WTINT2YR" ) }, regex = "The variable name" ) expect_named( out, c( "total", "age", "RIAGENDR", "RIDRETH1", "SDMVPSU", "SDMVSTRA", "WTINT2YR", "rescaled_weights_a", "rescaled_weights_a_1", "rescaled_weights_b" ) ) }) ================================================ FILE: tests/testthat/test-reshape_ci.R ================================================ test_that("reshape_ci with single CI level", { df <- data.frame( Parameter = c("Term 1"), CI = c(0.8), CI_low = c(0.2), CI_high = c(0.5), stringsAsFactors = FALSE ) df_reshape <- reshape_ci(df) expect_snapshot(df_reshape) }) test_that("reshape_ci with multiple CI levels", { x <- data.frame( Parameter = c("Term 1", "Term 2", "Term 1", "Term 2"), CI = c(0.8, 0.8, 0.9, 0.9), CI_low = c(0.2, 0.3, 0.1, 0.15), CI_high = c(0.5, 0.6, 0.8, 0.85), stringsAsFactors = FALSE ) expect_snapshot(reshape_ci(x)) expect_snapshot(reshape_ci(reshape_ci(x))) }) ================================================ FILE: tests/testthat/test-row_count.R ================================================ test_that("row_count", { d_mn <- data.frame( c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, 8) ) expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0)) expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1)) d_mn <- data.frame( c1 = c("a", "b", NA, "c"), c2 = c(NA, "b", NA, "d"), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, Inf), stringsAsFactors = FALSE ) expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) }) test_that("row_count, errors or messages", { data(iris) expect_error( expect_warning(row_count(iris, select = "abc")), regex = "must be a valid" ) expect_error( expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns" ) expect_error(row_count(iris[1], count = 3), regex = "with at least") expect_error( row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row" ) }) test_that("row_count, allow_coercion match", { d_mn <- data.frame( c1 = c("1", "2", NA, "3"), c2 = c(NA, "2", NA, "3"), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, Inf), stringsAsFactors = FALSE ) expect_identical( row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0) ) expect_identical( row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0) ) expect_identical( row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0) ) expect_identical( row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0) ) expect_error( row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has" ) # mix character / factor d_mn <- data.frame( c1 = factor(c("1", "2", NA, "3")), c2 = c("2", "1", NA, "3"), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, Inf), stringsAsFactors = FALSE ) expect_identical( row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0) ) expect_identical( row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0) ) expect_identical( row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0) ) expect_identical( row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0) ) expect_identical( row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0) ) expect_identical( row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0) ) }) ================================================ FILE: tests/testthat/test-row_means.R ================================================ test_that("row_means/sums", { d_mn <- data.frame( c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5), c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, 8) ) expect_equal( row_means(d_mn, min_valid = 4), c(NA, 2.75, NA, NA), tolerance = 1e-3 ) expect_equal( row_means(d_mn, min_valid = 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3 ) expect_equal( row_means(d_mn, min_valid = 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3 ) expect_equal( row_means(d_mn, min_valid = 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3 ) expect_equal( row_means(d_mn, min_valid = 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3 ) expect_equal( row_means(d_mn, min_valid = 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3 ) expect_equal( row_means(d_mn, min_valid = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1 ) expect_message(row_means(iris), regex = "Only numeric") expect_equal( row_means(iris, verbose = FALSE), rowMeans(iris[, 1:4]), tolerance = 1e-3, ignore_attr = TRUE ) expect_equal( row_sums(d_mn, min_valid = 4), c(NA, 11, NA, NA), tolerance = 1e-3 ) expect_equal( row_sums(d_mn, min_valid = 3), c(NA, 11, NA, 17), tolerance = 1e-3 ) expect_message(row_sums(iris), regex = "Only numeric") }) test_that("row_means/sums, errors or messages", { data(iris) expect_error( expect_warning(row_means(iris, select = "abc")), regex = "No columns" ) expect_error( expect_warning(row_sums(iris, select = "abc")), regex = "No columns" ) expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric") expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value") expect_error(row_means(iris, min_valid = "a"), regex = "numeric value") expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric") expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE)) expect_error(row_sums(iris[1], min_valid = 1), regex = "two numeric") expect_message(row_sums(iris[1:3, ], min_valid = 3), regex = "Only numeric") expect_silent(row_sums(iris[1:3, ], min_valid = 3, verbose = FALSE)) }) ================================================ FILE: tests/testthat/test-select_nse.R ================================================ foo <- function(data, select = NULL, exclude = NULL, regex = FALSE) { .select_nse( select, data, exclude = exclude, regex = regex, ignore_case = FALSE ) } test_that(".select_nse needs data", { expect_error( foo(select = "Sepal.Length", data = NULL), regexp = "must be provided" ) }) test_that(".select_nse needs a data frame or something coercible to a dataframe", { expect_identical( foo(select = "Sepal.Length", data = as.matrix(head(iris))), "Sepal.Length" ) expect_error( foo(select = "Sepal.Length", data = list(1:3, 1:2)), regexp = "must be a data frame" ) }) test_that(".select_nse: arg 'select' works", { expect_identical( foo(iris, select = NULL), names(iris) ) expect_identical( foo(iris, Petal.Length), "Petal.Length" ) expect_identical( foo(iris, c("Petal.Length", "Sepal.Width")), c("Petal.Length", "Sepal.Width") ) expect_identical( foo(iris, c(3, 2)), c("Petal.Length", "Sepal.Width") ) expect_identical( foo(iris, 1:5), names(iris) ) expect_identical( foo(iris, is.numeric), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_identical( foo(iris, is.numeric()), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) expect_identical( extract_column_names(iris, sepal.length, ignore_case = TRUE), "Sepal.Length" ) expect_identical( foo(iris, starts_with("Petal")), c("Petal.Length", "Petal.Width") ) expect_identical( foo(iris, ends_with("Length")), c("Sepal.Length", "Petal.Length") ) expect_identical( foo(iris, contains("Length")), c("Sepal.Length", "Petal.Length") ) expect_identical( foo(iris, regex("Length$")), c("Sepal.Length", "Petal.Length") ) expect_identical( foo(iris, "Len", regex = TRUE), c("Sepal.Length", "Petal.Length") ) }) test_that(".select_nse: arg 'exclude' works", { expect_identical( foo(iris, exclude = c("Petal.Length", "Sepal.Width")), c("Sepal.Length", "Petal.Width", "Species") ) expect_identical( foo(iris, exclude = c(3, 2)), c("Sepal.Length", "Petal.Width", "Species") ) expect_identical( foo(iris, exclude = starts_with("Petal")), c("Sepal.Length", "Sepal.Width", "Species") ) expect_identical( foo(iris, exclude = ends_with("Length")), c("Sepal.Width", "Petal.Width", "Species") ) expect_identical( foo(iris, exclude = contains("Length")), c("Sepal.Width", "Petal.Width", "Species") ) expect_identical( foo(iris, exclude = regex("Length$")), c("Sepal.Width", "Petal.Width", "Species") ) }) test_that(".select_nse: args 'select' and 'exclude' at the same time", { expect_identical( foo(iris, select = contains("Length"), exclude = starts_with("Petal")), "Sepal.Length" ) expect_identical( foo(iris, select = contains("Length"), exclude = contains("Length")), character(0) ) }) test_that(".select_nse: misc", { iris2 <- iris[, 1:3] expect_identical( foo(iris, select = names(iris2)), c("Sepal.Length", "Sepal.Width", "Petal.Length") ) expect_identical( foo(iris, select = names(iris2)[2]), "Sepal.Width" ) }) test_that(".select_nse: works with function and namespace", { model <- lm(Petal.Length ~ Petal.Width, data = iris) out <- data_select( iris, insight::find_predictors(model, effects = "fixed", flatten = TRUE) ) expect_identical(out, iris["Petal.Width"]) fun <- function(x) { data_select(iris, x) } out <- fun(insight::find_predictors(model, effects = "fixed", flatten = TRUE)) expect_identical(out, iris["Petal.Width"]) x <- "Sepal.Length" out <- fun(insight::find_predictors(model, effects = "fixed", flatten = TRUE)) expect_identical(out, iris["Petal.Width"]) }) test_that(".select_nse: allow character vector with :", { data(mtcars) out <- data_select(mtcars, c("cyl:hp", "wt", "vs:gear")) expect_named(out, c("cyl", "disp", "hp", "wt", "vs", "am", "gear")) out <- data_select(mtcars, c("cyl:hp", "wta", "vs:gear")) expect_named(out, c("cyl", "disp", "hp", "vs", "am", "gear")) out <- data_select(mtcars, c("hp:cyl", "wta", "vs:gear")) expect_named(out, c("hp", "disp", "cyl", "vs", "am", "gear")) out <- data_select(mtcars, c("cyl:hq", "wt", "vs:gear")) expect_named(out, c("wt", "vs", "am", "gear")) expect_warning( center(mtcars, c("cyl:hp", "wta", "vs:gear"), verbose = TRUE), regex = "Did you mean \"wt\"" ) expect_warning( center(mtcars, c("cyl:hq", "wt", "vs:gear"), verbose = TRUE), regex = "Did you mean one of \"hp\"" ) }) ================================================ FILE: tests/testthat/test-skewness-kurtosis.R ================================================ test_that("skewness", { expect_equal(skewness(iris$Sepal.Length)[[1]], 0.314911, tolerance = 1e-3) expect_equal( skewness(iris$Sepal.Length, type = 1)[[1]], 0.3117531, tolerance = 1e-3 ) expect_equal( skewness(iris$Sepal.Length, type = 3)[[1]], 0.3086407, tolerance = 1e-3 ) }) test_that("kurtosis", { expect_equal(kurtosis(iris$Sepal.Length)[[1]], -0.552064, tolerance = 1e-3) expect_equal( kurtosis(iris$Sepal.Length, type = 1)[[1]], -0.5735679, tolerance = 1e-3 ) expect_equal( kurtosis(iris$Sepal.Length, type = 3)[[1]], -0.6058125, tolerance = 1e-3 ) }) test_that("kurtosis and skewness with bootstrapping", { skip_if_not_installed("boot") set.seed(123) expect_equal( skewness(iris$Sepal.Length, iterations = 100)[[2]], 0.1262203, tolerance = 1e-3 ) set.seed(123) expect_equal( kurtosis(iris$Sepal.Length, iterations = 100)[[2]], 0.1878741, tolerance = 1e-3 ) }) test_that("skewness works with data frames", { skip_if_not_installed("boot") set.seed(123) expect_snapshot(skewness(iris[, 1:4])) set.seed(123) expect_snapshot(skewness(iris[, 1:4], iterations = 100)) }) test_that("kurtosis works with data frames", { skip_if_not_installed("boot") set.seed(123) expect_snapshot(kurtosis(iris[, 1:4])) set.seed(123) expect_snapshot(kurtosis(iris[, 1:4], iterations = 100)) }) test_that("skewness works with matrices", { skip_if_not_installed("boot") set.seed(123) expect_snapshot(skewness(as.matrix(iris[, 1:4]))) set.seed(123) expect_snapshot(skewness(as.matrix(iris[, 1:4]), iterations = 100)) }) test_that("kurtosis works with matrices", { skip_if_not_installed("boot") set.seed(123) expect_snapshot(kurtosis(as.matrix(iris[, 1:4]))) set.seed(123) expect_snapshot(kurtosis(as.matrix(iris[, 1:4]), iterations = 100)) }) test_that("skewness uses type 1 if not enough obs for type 2", { expect_warning( test <- skewness(c(1, 2), type = "2"), "Need at least 3 complete obs" ) expect_equal(test, skewness(c(1, 2), type = "1")) }) test_that("kurtosis uses type 1 if not enough obs for type 2", { expect_warning( test <- kurtosis(c(1, 2, 3), type = "2"), "Need at least 4 complete obs" ) expect_equal(test, kurtosis(c(1, 2, 3), type = "1")) }) ================================================ FILE: tests/testthat/test-smoothness.R ================================================ test_that("smoothness works", { set.seed(123) x <- (-10:10)^3 + rnorm(21, 0, 100) expect_equal(smoothness(x)[[1]], 0.9030014, tolerance = 0.001) expect_equal(smoothness(x, method = "auto")[[1]], 1.750452, tolerance = 0.001) }) test_that("smoothness works with iterations", { skip_if_not_installed("boot") set.seed(123) x <- (-10:10)^3 + rnorm(21, 0, 100) expect_equal( smoothness(x, iterations = 100)[[1]], 0.9030014, tolerance = 0.001 ) expect_equal( smoothness(x, method = "auto", iterations = 100)[[1]], 1.750452, tolerance = 0.001 ) }) test_that("smoothness with lag works", { set.seed(123) x <- (-10:10)^3 + rnorm(21, 0, 100) expect_equal(smoothness(x, lag = 0.5)[[1]], 0.5859015, tolerance = 0.001) expect_error(smoothness(x, lag = 0), "'lag' cannot be that small.") }) test_that("smoothness works with data frames", { skip_if(getRversion() < "4.0.0") set.seed(123) expect_snapshot(smoothness(BOD)) }) ================================================ FILE: tests/testthat/test-standardize-data.R ================================================ # standardize.numeric ----------------------------------------------------- test_that("standardize.numeric", { x <- standardize(seq(0, 1, length.out = 100)) expect_equal(mean(x), 0, tolerance = 0.01) x <- standardize(seq(0, 1, length.out = 100), two_sd = TRUE) expect_equal(sd(x), 0.5, tolerance = 0.01) x <- standardize(seq(0, 1, length.out = 100), robust = TRUE) expect_equal(median(x), 0, tolerance = 0.01) x <- standardize(seq(0, 1, length.out = 100), robust = TRUE, two_sd = TRUE) expect_equal(mad(x), 0.5, tolerance = 0.01) expect_message(standardize(c(0, 0, 0, 1, 1))) x <- standardize(c(-1, 0, 1), reference = seq(3, 4, length.out = 100)) expect_equal(mean(x), -11.943, tolerance = 0.01) }) # standardize factor / Date ----------------------------------------------- test_that("standardize.numeric", { f <- factor(c("c", "a", "b")) expect_identical(standardize(f), f) expect_equal(standardize(f, force = TRUE), c(1, -1, 0), ignore_attr = TRUE) d <- as.Date(c("1989/08/06", "1989/08/04", "1989/08/05")) expect_identical(standardize(d), d) expect_equal(standardize(d, force = TRUE), c(1, -1, 0), ignore_attr = TRUE) }) # standardize.data.frame -------------------------------------------------- test_that("standardize.data.frame", { skip_if_not_installed("poorman") data(iris) x <- standardize(iris) expect_equal(mean(x$Sepal.Length), 0, tolerance = 0.01) expect_length(levels(x$Species), 3) expect_equal( mean(subset(x, Species == "virginica")$Sepal.Length), 0.90, tolerance = 0.01 ) # check class attributes expect_identical( vapply(x, class, character(1)), c( Sepal.Length = "numeric", Sepal.Width = "numeric", Petal.Length = "numeric", Petal.Width = "numeric", Species = "factor" ) ) x2 <- standardize(x = iris[1, ], reference = iris) expect_true(all(x2[1, ] == x[1, ])) x <- standardize(poorman::group_by(iris, Species)) expect_equal(mean(x$Sepal.Length), 0, tolerance = 0.01) expect_length(levels(x$Species), 3) expect_equal( mean(subset(x, Species == "virginica")$Sepal.Length), 0, tolerance = 0.01 ) }) test_that("standardize.data.frame, NAs", { skip_if_not_installed("poorman") data(iris) iris$Sepal.Width[c(148, 65, 33, 58, 54, 93, 114, 72, 32, 23)] <- NA iris$Sepal.Length[c(11, 30, 141, 146, 13, 149, 6, 8, 48, 101)] <- NA x <- standardize(iris) expect_equal( head(x$Sepal.Length), c(-0.9163, -1.1588, -1.4013, -1.5226, -1.0376, NA), tolerance = 0.01 ) expect_equal( head(x$Sepal.Width), c(1.0237, -0.151, 0.3189, 0.0839, 1.2586, 1.9635), tolerance = 0.01 ) expect_identical(mean(x$Sepal.Length), NA_real_) x <- standardize(iris, two_sd = TRUE) expect_equal( head(x$Sepal.Length), c(-0.4603, -0.5811, -0.7019, -0.7623, -0.5207, NA), tolerance = 0.01 ) expect_equal( head(x$Sepal.Width), c(0.5118, -0.0755, 0.1594, 0.042, 0.6293, 0.9817), tolerance = 0.01 ) expect_identical(mean(x$Sepal.Length), NA_real_) x <- standardize(poorman::group_by(iris, .data$Species)) expect_equal( head(x$Sepal.Length), c(0.2547, -0.3057, -0.8661, -1.1463, -0.0255, NA), tolerance = 0.01 ) expect_equal( head(x$Sepal.Width), c(0.2369, -1.0887, -0.5584, -0.8235, 0.502, 1.2974), tolerance = 0.01 ) expect_identical(mean(x$Sepal.Length), NA_real_) }) test_that("standardize.data.frame, apend", { skip_if_not_installed("poorman") data(iris) iris$Sepal.Width[c(26, 43, 56, 11, 66, 132, 23, 133, 131, 28)] <- NA iris$Sepal.Length[c(32, 12, 109, 92, 119, 49, 83, 113, 64, 30)] <- NA x <- standardize(iris, append = TRUE) expect_identical( colnames(x), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Length_z", "Sepal.Width_z", "Petal.Length_z", "Petal.Width_z" ) ) expect_equal( head(x$Sepal.Length_z), c(-0.8953, -1.1385, -1.3816, -1.5032, -1.0169, -0.5306), tolerance = 0.01 ) expect_equal( head(x$Sepal.Width_z), c(1.04, -0.1029, 0.3543, 0.1257, 1.2685, 1.9542), tolerance = 0.01 ) expect_identical(mean(x$Sepal.Length_z), NA_real_) x <- standardize(iris, two_sd = TRUE, append = TRUE) expect_equal( head(x$Sepal.Length_z), c(-0.4477, -0.5692, -0.6908, -0.7516, -0.5084, -0.2653), tolerance = 0.01 ) expect_equal( head(x$Sepal.Width_z), c(0.52, -0.0514, 0.1771, 0.0629, 0.6343, 0.9771), tolerance = 0.01 ) expect_identical(mean(x$Sepal.Length_z), NA_real_) x <- standardize(poorman::group_by(iris, .data$Species), append = TRUE) expect_equal( head(x$Sepal.Length_z), c(0.2746, -0.2868, -0.8483, -1.129, -0.0061, 1.1168), tolerance = 0.01 ) expect_equal( head(x$Sepal.Width_z), c(0.1766, -1.1051, -0.5924, -0.8487, 0.4329, 1.2019), tolerance = 0.01 ) expect_identical(mean(x$Sepal.Length_z), NA_real_) }) test_that("standardize.data.frame, weights", { skip_if_not_installed("poorman") x <- rexp(30) w <- rpois(30, 20) + 1 expect_equal( sqrt(cov.wt(cbind(x, x), w)$cov[1, 1]), attr(standardize(x, weights = w), "scale"), tolerance = 1e-4 ) expect_equal( standardize(x, weights = w), standardize(data.frame(x), weights = w)$x, tolerance = 1e-4, ignore_attr = TRUE ) # name and vector give same results expect_equal( standardize(mtcars, exclude = "cyl", weights = mtcars$cyl), standardize(mtcars, weights = "cyl"), tolerance = 1e-4 ) d <- poorman::group_by(mtcars, am) expect_warning(standardize(d, weights = d$cyl)) }) # Unstandardize ----------------------------------------------------------- test_that("unstandardize, numeric", { data(iris) x <- standardize(iris$Petal.Length) rez <- unstandardize(x) expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE) rez <- unstandardize(x, reference = iris$Petal.Length) expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE) rez <- unstandardize( x, center = mean(iris$Petal.Length), scale = stats::sd(iris$Petal.Length) ) expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE) rez <- unstandardize( 0, center = mean(iris$Petal.Length), scale = stats::sd(iris$Petal.Length) ) expect_equal(rez, mean(iris$Petal.Length), tolerance = 1e-3) x <- standardize(iris$Petal.Length, robust = TRUE, two_sd = TRUE) rez <- unstandardize(x, robust = TRUE, two_sd = TRUE) expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE) x <- scale(iris$Petal.Length) rez <- unstandardize(x) expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE) x <- scale(iris$Petal.Length, center = 3, scale = 2) rez <- unstandardize(x) expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE) }) test_that("unstandardize, data frame", { skip_if_not_installed("poorman") data(iris) x <- standardize(iris) rez <- unstandardize(x) expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE) x <- standardize(iris, select = "Petal.Length") rez <- unstandardize(x) expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE) x <- standardize(iris, select = starts_with("Pet")) rez <- unstandardize(x, select = starts_with("Pet")) expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE) x <- standardize(iris, select = "Petal.Length") rez <- unstandardize( x, center = c(Petal.Length = mean(iris$Petal.Length)), scale = c(Petal.Length = stats::sd(iris$Petal.Length)) ) expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE) expect_error(unstandardize( x, center = mean(iris$Petal.Length), scale = stats::sd(iris$Petal.Length) )) x <- standardize(iris) rez <- unstandardize(x, center = rep(0, 4), scale = rep(1, 4)) expect_equal(rez, x, tolerance = 0.1, ignore_attr = TRUE) data(iris) x <- standardize(iris, robust = TRUE, two_sd = TRUE) rez <- unstandardize(x, robust = TRUE, two_sd = TRUE) expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE) }) test_that("un/standardize, matrix", { set.seed(4) x <- matrix(sample(8), nrow = 4) colnames(x) <- letters[1:2] rownames(x) <- LETTERS[1:4] z1 <- standardize(x) z2 <- scale(x) expect_equal(z1, z2, ignore_attr = TRUE) expect_equal(unstandardize(z1), x, ignore_attr = TRUE) expect_identical(unstandardize(z2), unstandardize(z1)) }) test_that("unstandardize with reference (data frame)", { x <- standardize(x = iris, reference = iris) x2 <- unstandardize(x, reference = iris) expect_equal(x2, iris, ignore_attr = TRUE) x <- standardize(x = iris, reference = iris, robust = TRUE) x2 <- unstandardize(x, reference = iris, robust = TRUE) expect_equal(x2, iris, ignore_attr = TRUE) }) test_that("unstandardize does nothing with characters and factors", { expect_identical( unstandardise(c("a", "b")), c("a", "b") ) expect_identical( unstandardise(factor(c(1, 2))), factor(c(1, 2)) ) }) # select helpers ------------------------------ test_that("standardize regex", { expect_identical( standardize(mtcars, select = "pg", regex = TRUE), standardize(mtcars, select = "mpg") ) }) # standardize when only providing one of center/scale --------------- test_that("standardize when only providing one of center/scale", { x <- 1:10 expect_identical( as.vector(datawizard::standardize(x, center = FALSE)), x / sd(x) ) expect_identical( as.vector(datawizard::standardize(x, center = 2)), (x - 2) / sd(x) ) expect_identical( as.vector(datawizard::standardize(x, scale = FALSE)), as.vector(datawizard::center(x)) ) expect_identical( as.vector(datawizard::standardize(x, scale = 1.5)), (x - mean(x)) / 1.5 ) }) # grouped data test_that("unstandardize: grouped data", { skip_if_not_installed("poorman") # 1 group, 1 standardized var stand <- poorman::group_by(mtcars, cyl) stand <- standardize(stand, "mpg") unstand <- unstandardize(stand, select = "mpg") expect_identical( poorman::ungroup(unstand), mtcars, ignore_attr = TRUE ) expect_s3_class(unstand, "grouped_df") # 2 groups, 1 standardized var set.seed(123) test <- iris test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE) stand <- poorman::group_by(test, Species, grp) stand <- standardize(stand, "Sepal.Length") expect_identical( poorman::ungroup(unstandardize(stand, select = "Sepal.Length")), test ) # 2 groups, 2 standardized vars set.seed(123) test <- iris test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE) stand <- poorman::group_by(test, Species, grp) stand <- standardize(stand, c("Sepal.Length", "Petal.Length")) expect_identical( poorman::ungroup(unstandardize( stand, select = c("Sepal.Length", "Petal.Length") )), test ) expect_s3_class(unstand, "grouped_df") # can't recover attributes stand <- poorman::group_by(iris, Species) stand <- standardize(stand, "Sepal.Length") attr(stand, "groups") <- NULL expect_error( unstandardize(stand, "Sepal.Length"), regexp = "Couldn't retrieve the necessary information" ) # normalize applied on grouped data but unstandardize applied on ungrouped data stand <- poorman::group_by(mtcars, cyl) stand <- standardize(stand, "mpg") stand <- poorman::ungroup(stand) expect_error( unstandardize(stand, "mpg"), regexp = "must provide the arguments" ) # standardize applied on grouped data but unstandardize applied different grouped # data stand <- poorman::group_by(stand, am) expect_error( unstandardize(stand, "mpg"), regexp = "Couldn't retrieve the necessary" ) }) ================================================ FILE: tests/testthat/test-standardize_datagrid.R ================================================ # standardize ----------------------------------------------------- test_that("standardize.datagrid", { x <- insight::get_datagrid( iris, by = "Sepal.Length", range = "sd", length = 3 ) out <- standardize(x) expect_identical(as.numeric(out$Sepal.Length), c(-1, 0, 1), tolerance = 1e-3) expect_identical(as.numeric(out$Sepal.Width), c(0, 0, 0), tolerance = 1e-3) x <- insight::get_datagrid(iris, by = "Sepal.Length = c(-1, 0)") out <- unstandardize(x, select = "Sepal.Length") expect_identical( out$Sepal.Length[1:2], c(mean(iris$Sepal.Length) - sd(iris$Sepal.Length), mean(iris$Sepal.Length)), tolerance = 1e-3 ) }) ================================================ FILE: tests/testthat/test-standardize_models.R ================================================ # standardize.lm ---------------------------------------------------------- test_that("standardize.lm", { iris2 <- na.omit(iris) iris_z <- standardize(iris2) m0 <- lm(Sepal.Length ~ Species * Petal.Width, data = iris_z) m1 <- lm(Sepal.Length ~ Species * Petal.Width, data = iris2) model <- standardize(m1) expect_identical(coef(m0), coef(model)) }) test_that("standardize, mlm", { m <- lm(cbind(mpg, hp) ~ cyl + am, data = mtcars) m2 <- lm(scale(cbind(mpg, hp)) ~ scale(cyl) + scale(am), data = mtcars) mz <- standardize(m) expect_equal(coef(mz), coef(m2), ignore_attr = TRUE, tolerance = 1e-4) }) test_that("standardize | errors", { my_lm_external_formula <- function(.dat, predicted, predictor) { my_formula <- as.formula(paste0(predicted, "~", predictor)) lm(formula = my_formula, data = .dat) } m <- my_lm_external_formula(mtcars, "mpg", "am") ers <- capture_error(standardize(m)) expect_match( as.character(ers), "Try instead to standardize the data", fixed = TRUE ) }) test_that("standardize | problematic formulas", { data(mtcars) m <- lm(mpg ~ hp, data = mtcars) expect_equal( coef(standardise(m)), c(`(Intercept)` = -3.14935717633686e-17, hp = -0.776168371826586), tolerance = 1e-4 ) colnames(mtcars)[1] <- "1_mpg" m <- lm(`1_mpg` ~ hp, data = mtcars) expect_error(standardise(m), regex = "Looks like") # works interactive only # data(mtcars) # m <- lm(mtcars$mpg ~ mtcars$hp) # expect_error(standardise(m), regex = "model formulas") m <- lm(mtcars[, 1] ~ hp, data = mtcars) expect_error(standardise(m), regex = "indexed data") }) # Transformations --------------------------------------------------------- test_that("transformations", { skip_if_not_installed("effectsize") # deal with log / sqrt terms expect_message(standardize(lm(mpg ~ sqrt(cyl) + log(hp), mtcars))) expect_message(standardize(lm(mpg ~ sqrt(cyl), mtcars))) expect_message(standardize(lm(mpg ~ log(hp), mtcars))) # difference between stand-methods: mt <- mtcars mt$hp_100 <- mt$hp / 100 fit_exp <- lm(mpg ~ exp(hp_100), mt) fit_scale1 <- lm(scale(mpg) ~ exp(scale(hp_100)), mt) fit_scale2 <- lm(scale(mpg) ~ scale(exp(hp_100)), mt) expect_equal( effectsize::standardize_parameters(fit_exp, method = "refit")[2, 2], unname(coef(fit_scale1)[2]), ignore_attr = TRUE ) expect_equal( effectsize::standardize_parameters(fit_exp, method = "basic")[2, 2], unname(coef(fit_scale2)[2]), ignore_attr = TRUE ) # fmt: skip d <- data.frame( time = factor(rep(c("1", "2", "3", "4", "5"), 6)), group = rep(rep(c(1, 2), 3), each = 5L), sum = rep(c(0, 5, 10, 15, 20, 0, 20, 25, 45, 50), 3) ) m <- lm(log(sum + 1) ~ as.numeric(time) * group, data = d) expect_message({ out <- standardize(m) }) expect_identical( coef(m), c( `(Intercept)` = -0.4575, `as.numeric(time)` = 0.5492, group = 0.3379, `as.numeric(time):group` = 0.15779 ), tolerance = 0.01 ) }) # W/ weights -------------------------------------------------------------- test_that("weights", { expect_warning(standardize(mtcars, weights = "xx")) m <- lm(mpg ~ wt + hp, weights = cyl, mtcars) sm <- standardize(m, weights = TRUE) sm_data <- insight::get_data(sm, source = "frame") sm_data2 <- standardize( mtcars, select = c("mpg", "wt", "hp"), weights = "cyl" ) expect_identical( sm_data[, c("mpg", "wt", "hp")], sm_data2[, c("mpg", "wt", "hp")] ) expect_error(standardize(m, weights = TRUE, robust = TRUE), NA) # no weights in stding sm_xw <- standardize(m, weights = FALSE) sm_data_xw <- insight::get_data(sm_xw, source = "frame") expect_false(isTRUE(all.equal(coef(sm)[-1], coef(sm_xw)[-1]))) skip_if_not_installed("effectsize") # refit and posthoc should give same results stdREFIT <- effectsize::standardize_parameters(m, method = "refit") expect_equal( stdREFIT[[2]], effectsize::standardize_parameters(m, method = "posthoc")[[2]], ignore_attr = TRUE ) expect_equal( stdREFIT[[2]], effectsize::standardize_parameters(m, method = "basic")[[2]], ignore_attr = TRUE ) }) # weights + missing data -------------------------------------------------- test_that("weights + NA", { set.seed(1234) data(iris) # data setup iris$weight_me <- runif(nrow(iris)) iris$Sepal.Length[sample(nrow(iris), size = 10)] <- NA iris$weight_me[sample(nrow(iris), size = 10)] <- NA # standardize 2nd data set iris2 <- standardize( iris, select = c("Sepal.Length", "Petal.Width"), remove_na = "all" ) iris3 <- standardize( iris, select = c("Sepal.Length", "Petal.Width"), weights = "weight_me", remove_na = "selected" ) m1 <- lm( Sepal.Length ~ Species + Petal.Width, data = iris, weights = weight_me ) # weights, missing data, but data isn't weight-stdized m2 <- lm( Sepal.Length ~ Species + Petal.Width, data = iris2, weights = weight_me ) sm2 <- standardize(m1, weights = FALSE) expect_identical(coef(m2), coef(sm2)) # weights, missing data, and data is weight-stdized m3 <- lm( Sepal.Length ~ Species + Petal.Width, data = iris3, weights = weight_me ) sm3 <- standardize(m1, weights = TRUE) expect_identical(coef(m3), coef(sm3)) }) # weights + missing data ´+ na.action = na.exclude -------------------------------------------------- test_that("weights + NA + na.exclude", { skip_if_not_installed("effectsize") set.seed(1234) data(iris) # data setup iris$weight_me <- runif(nrow(iris)) iris$Sepal.Length[sample(nrow(iris), size = 25)] <- NA iris$weight_me[sample(nrow(iris), size = 15)] <- NA d <- iris m1 <- lm( Sepal.Length ~ Species + Petal.Width, data = d, weights = weight_me, na.action = na.exclude ) m2 <- lm(Sepal.Length ~ Species + Petal.Width, data = d, weights = weight_me) expect_identical( coef(standardize(m2)), coef(standardize(m1)), tolerance = 1e-3 ) expect_identical( effectsize::standardize_parameters(m1, method = "basic")[[2]], effectsize::standardize_parameters(m2, method = "basic")[[2]], tolerance = 1e-3 ) }) # subset ------------------ test_that("fail with subset", { data("mtcars") mod1 <- lm(mpg ~ hp, data = mtcars, subset = cyl > 4) expect_error(standardise(mod1), regexp = "subset") }) # don't standardize non-Gaussian response ------------------------------------ test_that("standardize non-Gaussian response", { skip_on_cran() skip_if_not_installed("lme4") set.seed(1234) data(sleepstudy, package = "lme4") m1 <- glm(Reaction ~ Days, family = Gamma(), data = sleepstudy) m2 <- glm( Reaction ~ Days, family = Gamma(link = "identity"), data = sleepstudy ) m3 <- glm(Reaction ~ Days, family = inverse.gaussian(), data = sleepstudy) expect_identical( coef(standardize(m1)), c(`(Intercept)` = 0.00338, Days = -0.00034), tolerance = 1e-2 ) expect_identical( coef(standardize(m2)), c(`(Intercept)` = 298.48571, Days = 29.70754), tolerance = 1e-3 ) expect_identical( coef(standardize(m3)), c(`(Intercept)` = 1e-05, Days = 0), tolerance = 1e-3 ) }) # variables evaluated in the environment $$$ ------------------------------ test_that("variables evaluated in the environment", { m <- lm(mtcars$mpg ~ mtcars$cyl + am, data = mtcars) w <- capture_error(standardize(m)) expect_true(any(grepl("Using `$`", w, fixed = TRUE))) ## Note: # No idea why this is suddenly not giving a warning on older R versions. m <- lm(mtcars$mpg ~ mtcars$cyl + mtcars$am, data = mtcars) w <- capture_error(standardize(m)) expect_true(any(grepl("Using `$`", w, fixed = TRUE))) }) # mediation models -------------------------------------------------------- test_that("standardize mediation", { skip_on_cran() skip_if_not_installed("mediation") set.seed(444) data(jobs, package = "mediation") jobs$econ_hard <- jobs$econ_hard * 20 b.int <- lm(job_seek ~ treat * age + econ_hard + sex, data = jobs) d.int <- lm(depress2 ~ treat * job_seek * age + econ_hard + sex, data = jobs) med1 <- mediation::mediate( b.int, d.int, sims = 200, treat = "treat", mediator = "job_seek" ) med2 <- mediation::mediate( b.int, d.int, sims = 200, treat = "treat", mediator = "job_seek", covariates = list(age = mean(jobs$age)) ) out1 <- summary(standardize(med1)) expect_message({ out2 <- summary(standardize(med2)) }) expect_identical( unlist(out1[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]), unlist(out2[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]), tolerance = 0.1 ) med0 <- mediation::mediate( standardize(b.int), standardize(d.int), sims = 200, treat = "treat", mediator = "job_seek" ) out0 <- summary(med0) medz <- standardize(mediation::mediate( b.int, d.int, sims = 200, treat = "treat", mediator = "job_seek" )) outz <- summary(medz) expect_identical( unlist(out0[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]), unlist(outz[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]), tolerance = 0.1 ) }) # Offsets ----------------------------------------------------------------- test_that("offsets", { skip_if_not_installed("effectsize") skip_if_not_installed("parameters") m <- lm(mpg ~ hp + offset(wt), data = mtcars) expect_warning({ mz1 <- standardize(m) }) expect_warning({ mz2 <- standardize(m, two_sd = TRUE) }) expect_identical(c(1, 2) * coef(mz1), coef(mz2)) m <- glm(cyl ~ hp + offset(wt), family = poisson(), data = mtcars) expect_warning( { mz <- standardize(m) }, regexp = NA ) par1 <- parameters::model_parameters(mz) par2 <- effectsize::standardize_parameters(m, method = "basic") expect_identical(par2[2, 2], par1[2, 2], tolerance = 0.05) }) # BRMS -------------------------------------------------------------------- test_that("brms", { skip_on_cran() skip_on_os(c("windows", "mac")) skip_if_not_installed("brms") skip_if_not_installed("RcppEigen") skip_if_not_installed("BH") invisible( capture.output({ mod <- brms::brm( mpg ~ hp, data = mtcars, refresh = 0, chains = 1, silent = 2 ) }) ) expect_warning( standardize(mod), regexp = "without adjusting priors may lead to bogus" ) }) # fixest -------------------------------------------------------------------- test_that("fixest", { skip_if_not_installed("fixest") mtcars_stand <- standardize(mtcars) orig <- fixest::feols( drat ~ mpg + hp^2 | cyl + am, data = mtcars, se = "hetero" ) # TODO: Remove this suppressWarnings() when a new version of `fixest` that # contains the fix for https://github.com/lrberge/fixest/issues/618 is on CRAN # (CRAN version is 0.13.2 at the time of writing). suppressWarnings({ auto_stand <- standardize(orig) }) manual_stand <- fixest::feols( drat ~ mpg + hp^2 | cyl + am, data = mtcars_stand, se = "hetero" ) # Need to unname because I(hp^2) in the manual one becomes I(I(hp ^2)) in the # automated one. expect_identical( unname(auto_stand$coefficients), unname(manual_stand$coefficients) ) expect_identical(unname(auto_stand$se), unname(manual_stand$se)) ### Inform the user if some terms are log() or sqrt() orig <- fixest::feols( drat ~ log(mpg) | cyl + am, data = mtcars ) # TODO: same as above expect_message( suppressWarnings(standardize(orig)), "Formula contains log- or sqrt-terms" ) orig <- fixest::feols( drat ~ sqrt(mpg) | cyl + am, data = mtcars ) # TODO: same as above expect_message( suppressWarnings(standardize(orig)), "Formula contains log- or sqrt-terms" ) }) ================================================ FILE: tests/testthat/test-std_center.R ================================================ d <- iris[1:4, ] # standardize ----------------------------------------------------- test_that("standardize.data.frame", { x <- standardise(d, select = c("Sepal.Length", "Sepal.Width")) expect_equal( as.vector(x$Sepal.Length), as.vector(scale(d$Sepal.Length)), tolerance = 0.001 ) expect_equal( as.vector(x$Petal.Length), as.vector(d$Petal.Length), tolerance = 0.001 ) expect_equal( colnames(x), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) x <- standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE) expect_equal( as.vector(x$Sepal.Length_z), as.vector(scale(d$Sepal.Length)), tolerance = 0.001 ) expect_equal( as.vector(x$Sepal.Length), as.vector(d$Sepal.Length), tolerance = 0.001 ) expect_equal( as.vector(x$Petal.Length), as.vector(d$Petal.Length), tolerance = 0.001 ) expect_equal( colnames(x), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Length_z", "Sepal.Width_z" ) ) }) test_that("standardize other classes", { d <- data.frame( a = 1:5, b = factor(letters[1:5]), c = as.Date(c( "2022-03-22", "2022-01-02", "2022-02-02", "2021-04-02", "2020-01-19" )), d = c(TRUE, TRUE, FALSE, FALSE, TRUE), e = as.complex(1:5) ) x <- standardize(d$a) expect_equal( as.numeric(x), c(-1.26491, -0.63246, 0, 0.63246, 1.26491), tolerance = 1e-3, ignore_attr = TRUE ) x <- standardize(d$b) expect_equal(as.numeric(x), 1:5, tolerance = 1e-3, ignore_attr = TRUE) x <- standardize(d$b, force = TRUE) expect_equal( as.numeric(x), c(-1.26491, -0.63246, 0, 0.63246, 1.26491), tolerance = 1e-3, ignore_attr = TRUE ) x <- standardize(d$c) expect_equal( x, as.Date( c( "2022-03-22", "2022-01-02", "2022-02-02", "2021-04-02", "2020-01-19" ) ), tolerance = 1e-3, ignore_attr = TRUE ) x <- standardize(d$c, force = TRUE) expect_equal( as.numeric(x), c(0.76992, 0.53121, 0.62488, -0.29975, -1.62626), tolerance = 1e-3, ignore_attr = TRUE ) x <- standardize(d$d) expect_equal( x, c(TRUE, TRUE, FALSE, FALSE, TRUE), tolerance = 1e-3, ignore_attr = TRUE ) expect_message(x <- standardize(d$d, force = TRUE)) expect_equal( x, c(0.7303, 0.7303, -1.09545, -1.09545, 0.7303), tolerance = 1e-3, ignore_attr = TRUE ) ## TODO conflict with standardize.default() in effectsize expect_warning( x <- standardize(d$e), "cannot be standardized" ) expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE) }) # center ----------------------------------------------------- test_that("center.data.frame", { x <- center(d, select = c("Sepal.Length", "Sepal.Width")) expect_equal( as.vector(x$Sepal.Length), as.vector(d$Sepal.Length - mean(d$Sepal.Length)), tolerance = 0.001 ) expect_equal( as.vector(x$Petal.Length), as.vector(d$Petal.Length), tolerance = 0.001 ) expect_equal( colnames(x), c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") ) x <- center(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE) expect_equal( as.vector(x$Sepal.Length_c), as.vector(d$Sepal.Length - mean(d$Sepal.Length)), tolerance = 0.001 ) expect_equal( as.vector(x$Sepal.Length), as.vector(d$Sepal.Length), tolerance = 0.001 ) expect_equal( as.vector(x$Petal.Length), as.vector(d$Petal.Length), tolerance = 0.001 ) expect_equal( colnames(x), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Length_c", "Sepal.Width_c" ) ) }) test_that("center other classes", { d <- data.frame( a = 1:5, b = factor(letters[1:5]), c = as.Date(c( "2022-03-22", "2022-01-02", "2022-02-02", "2021-04-02", "2020-01-19" )), d = c(TRUE, TRUE, FALSE, FALSE, TRUE), e = as.complex(1:5) ) x <- center(d$a) expect_equal( as.numeric(x), c(-2, -1, 0, 1, 2), tolerance = 1e-3, ignore_attr = TRUE ) x <- center(d$b) expect_equal(as.numeric(x), 1:5, tolerance = 1e-3, ignore_attr = TRUE) x <- center(d$b, force = TRUE) expect_equal( as.numeric(x), c(-2, -1, 0, 1, 2), tolerance = 1e-3, ignore_attr = TRUE ) x <- center(d$c) expect_equal( x, as.Date( c( "2022-03-22", "2022-01-02", "2022-02-02", "2021-04-02", "2020-01-19" ) ), tolerance = 1e-3, ignore_attr = TRUE ) x <- center(d$c, force = TRUE) expect_equal( as.numeric(x), c(254.8, 175.8, 206.8, -99.2, -538.2), tolerance = 1e-3, ignore_attr = TRUE ) x <- center(d$d) expect_equal( x, c(TRUE, TRUE, FALSE, FALSE, TRUE), tolerance = 1e-3, ignore_attr = TRUE ) expect_message(x <- center(d$d, force = TRUE)) expect_equal( x, c(0.4, 0.4, -0.6, -0.6, 0.4), tolerance = 1e-3, ignore_attr = TRUE ) expect_message(x <- center(d$e)) expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE) }) ================================================ FILE: tests/testthat/test-std_center_scale_args.R ================================================ d <- data.frame(a = 1:5, b = 21:25, c = 41:45) test_that("standardize", { x <- standardize(d) expect_equal(as.vector(x$a), as.vector(scale(d$a)), tolerance = 0.001) }) test_that("standardize", { x <- standardize(d, center = 5, scale = 2) expect_equal(as.vector(x$a), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001) }) test_that("standardize", { x <- standardize(d, center = c(5, 25, 45), scale = c(3, 3, 3)) expect_equal( as.vector(x$a), c(-1.33333, -1, -0.66667, -0.33333, 0), tolerance = 0.001 ) expect_equal( as.vector(x$b), c(-1.33333, -1, -0.66667, -0.33333, 0), tolerance = 0.001 ) }) test_that("standardize", { x <- standardize(d, center = c(c = 45, a = 5, b = 25), scale = c(3, 3, 3)) expect_equal( as.vector(x$a), c(-1.33333, -1, -0.66667, -0.33333, 0), tolerance = 0.001 ) expect_equal( as.vector(x$b), c(-1.33333, -1, -0.66667, -0.33333, 0), tolerance = 0.001 ) }) test_that("standardize", { x <- standardize(d, center = c(c = 45, a = 5, b = 25), scale = c(1, 2, 3)) expect_equal(as.vector(x$a), c(-4, -3, -2, -1, 0), tolerance = 0.001) expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001) }) test_that("standardize", { x <- standardize( d, center = c(c = 45, a = 5, b = 25), scale = c(a = 1, b = 2, c = 3) ) expect_equal(as.vector(x$a), c(-4, -3, -2, -1, 0), tolerance = 0.001) expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001) }) test_that("standardize", { x <- standardize( d, center = c(c = 45, a = 5, b = 25), scale = c(c = 3, b = 2, a = 1) ) expect_equal(as.vector(x$a), c(-4, -3, -2, -1, 0), tolerance = 0.001) expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001) }) test_that("standardize", { x <- standardize( d, center = c(c = 45, a = 5, b = 25), scale = c(c = 1, b = 2, a = 3) ) expect_equal( as.vector(x$a), c(-1.33333, -1, -0.66667, -0.33333, 0), tolerance = 0.001 ) expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001) }) ================================================ FILE: tests/testthat/test-text_format.R ================================================ test_that("text formatting helpers work as expected", { expect_snapshot(text_format( c( "A very long First", "Some similar long Second", "Shorter Third", "More or less long Fourth", "And finally the Last" ), width = 20 )) expect_snapshot(text_format( c( "A very long First", "Some similar long Second", "Shorter Third", "More or less long Fourth", "And finally the Last" ), last = " or ", enclose = "`", width = 20 )) expect_identical( text_fullstop(c("something", "something else.")), c("something.", "something else.") ) expect_identical( text_lastchar(c("ABC", "DEF"), n = 2), c("BC", "EF"), ignore_attr = TRUE ) expect_identical( text_concatenate(c("First", "Second")), "First and Second" ) expect_identical( text_concatenate("First"), "First" ) expect_identical( text_concatenate(c("First", "Second", "Last")), "First, Second and Last" ) expect_identical( text_concatenate( c("First", "Second", "Last"), last = " or ", enclose = "`" ), "`First`, `Second` or `Last`" ) expect_identical( text_remove(c("one!", "two", "three!"), "!"), c("one", "two", "three") ) expect_identical( text_paste(c("A", "", "B"), c("42", "42", "42")), c("A, 42", "42", "B, 42") ) expect_identical( text_paste(c("A", "", "B"), c("42", "42", "42"), enclose = "`"), c("`A`, `42`", "`42`", "`B`, `42`") ) }) test_that("text formatters respect `width` argument", { expect_snapshot({ long_text <- strrep("abc ", 100) cat(text_format(long_text, width = 50)) cat(text_format(long_text, width = 80)) withr::with_options(list(width = 50), code = { cat(text_format(long_text)) }) }) }) ================================================ FILE: tests/testthat/test-unnormalize.R ================================================ test_that("unnormalize work as expected", { x <- normalize(c(0, 1, 5, -5, -2)) expect_equal( unnormalize(x), c(0, 1, 5, -5, -2), ignore_attr = TRUE ) expect_error( unnormalize(c(0, 1, 5, -5, -2)), "Can't unnormalize variable" ) }) test_that("unnormalize error if not supported", { expect_error( unnormalize(c("a", "b")), "can't be unnormalized" ) }) test_that("unnormalize and unstandardized x 4", { set.seed(123) x <- rnorm(6, 4, 10) z <- standardise(x) expect_named(attributes(z), c("center", "scale", "robust", "class")) expect_equal(attributes(z)$center, 8.47, tolerance = 0.01) expect_equal(unstandardise(z), x, ignore_attr = TRUE) z <- center(x) expect_named(attributes(z), c("center", "scale", "robust", "class")) expect_equal(unstandardise(z), x, ignore_attr = TRUE) z <- normalize(x) expect_named( attributes(z), c( "include_bounds", "flag_bounds", "min_value", "vector_length", "range_difference", "class" ) ) expect_equal(unnormalize(z), x, ignore_attr = TRUE) z <- change_scale(x, to = c(-3, 14.5)) expect_named( attributes(z), c( "min_value", "max_value", "new_min", "new_max", "range_difference", "to_range", "class" ) ) expect_equal(unnormalize(z), x, ignore_attr = TRUE) z <- change_scale(x, range = c(-100, 100)) expect_named( attributes(z), c( "min_value", "max_value", "new_min", "new_max", "range_difference", "to_range", "class" ) ) expect_equal(unnormalize(z), x, ignore_attr = TRUE) }) # select helpers ------------------------------ test_that("unnormalize regex", { x <- normalize(mtcars, select = "mpg") expect_identical( unnormalize(x, select = "pg", regex = TRUE), unnormalize(x, select = "mpg") ) }) test_that("unnormalize: grouped data", { skip_if_not_installed("poorman") # 1 group, 1 normalized var norm <- poorman::group_by(mtcars, cyl) norm <- normalize(norm, "mpg") expect_identical( poorman::ungroup(unnormalize(norm, "mpg")), mtcars, ignore_attr = TRUE # unnormalize removed rownames ) # 2 groups, 1 normalized var set.seed(123) test <- iris test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE) norm <- poorman::group_by(test, Species, grp) norm <- normalize(norm, "Sepal.Length") expect_identical( poorman::ungroup(unnormalize(norm, "Sepal.Length")), test ) # 2 groups, 2 normalized vars set.seed(123) test <- iris test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE) norm <- poorman::group_by(test, Species, grp) norm <- normalize(norm, c("Sepal.Length", "Petal.Length")) unnorm <- unnormalize(norm, c("Sepal.Length", "Petal.Length")) expect_identical( poorman::ungroup(unnorm), test ) expect_s3_class(unnorm, "grouped_df") # can't recover attributes norm <- poorman::group_by(iris, Species) norm <- normalize(norm, "Sepal.Length") attr(norm, "groups") <- NULL expect_error( unnormalize(norm, "Sepal.Length"), regexp = "Couldn't retrieve the necessary information" ) # normalize applied on grouped data but unnormalize applied on ungrouped data norm <- poorman::group_by(mtcars, cyl) norm <- normalize(norm, "mpg") norm <- poorman::ungroup(norm) expect_error( unnormalize(norm, "mpg"), regexp = "Can't unnormalize variable" ) # normalize applied on grouped data but unnormalize applied different grouped # data norm <- poorman::group_by(norm, am) expect_error( unnormalize(norm, "mpg"), regexp = "Couldn't retrieve the necessary" ) }) ================================================ FILE: tests/testthat/test-utils.R ================================================ test_that(".coerce_to_dataframe works for matrices", { mat <- matrix(c(1, 2, 3, 11, 12, 13), nrow = 2, ncol = 3, byrow = TRUE) expect_equal( .coerce_to_dataframe(mat), data.frame( V1 = c(1, 11), V2 = c(2, 12), V3 = c(3, 13) ) ) }) test_that(".coerce_to_dataframe works for vectors and list", { expect_equal( .coerce_to_dataframe(1:3), data.frame(data = 1:3) ) expect_equal( .coerce_to_dataframe(c("a", "b", "c")), data.frame(data = c("a", "b", "c"), stringsAsFactors = FALSE) ) expect_equal( .coerce_to_dataframe(list(var1 = 1:3, var2 = 4:6)), data.frame(var1 = 1:3, var2 = 4:6) ) }) test_that(".coerce_to_dataframe errors correctly if can't coerce", { expect_error( .coerce_to_dataframe(list(var1 = 1:3, var2 = 4:5)), regexp = "object that can be coerced" ) }) test_that(".is_sorted works", { expect_true(.is_sorted(1:3)) expect_true(.is_sorted(c("a", "b", "c"))) expect_true(.is_sorted(factor(c("a", "b", "c")))) expect_false(.is_sorted(c(1, 3, 2))) expect_false(.is_sorted(c("b", "a", "c"))) expect_false(.is_sorted(factor(c("b", "a", "c")))) }) ================================================ FILE: tests/testthat/test-utils_cols.R ================================================ test_char <- data.frame( a = c("iso", 2, 5), b = c("year", 3, 6), c = c(NA, 5, 7), stringsAsFactors = FALSE ) test_num <- data.frame( a = c(5, 2, 5), b = c(3, 3, 6), c = c(NA, 5, 7) ) test_na <- data.frame( a = c(NA, 2, 5), b = c(NA, 3, 6), c = c(NA, 5, 7) ) test_that("row_to_colnames works", { test <- row_to_colnames(test_char, verbose = FALSE) expect_identical( colnames(test), c("iso", "year", "x1") ) test <- row_to_colnames(test_num, verbose = FALSE) expect_identical( colnames(test), c("5", "3", "x1") ) test <- row_to_colnames(test_na, verbose = FALSE) expect_identical( colnames(test), c("x1", "x2", "x3") ) }) test_that("row_to_colnames: check arg 'row'", { expect_error( row_to_colnames(test_num, row = "hi", verbose = FALSE), regexp = "Argument `row`" ) expect_error( row_to_colnames(test_num, row = 6), regexp = "You used row = 6" ) expect_error( row_to_colnames(test_num, row = c(3, 5), verbose = FALSE), regexp = "Argument `row`" ) expect_identical( row_to_colnames(test_num, verbose = FALSE), row_to_colnames(test_num, row = 1, verbose = FALSE) ) }) test_that("row_to_colnames: check arg 'na_prefix'", { test <- row_to_colnames(test_char, na_prefix = "foo", verbose = FALSE) expect_identical( colnames(test), c("iso", "year", "foo1") ) test <- row_to_colnames(test_num, na_prefix = "foo", verbose = FALSE) expect_identical( colnames(test), c("5", "3", "foo1") ) }) #----------------------------------------------------- foo <- data.frame( ARG = c("BRA", "FRA"), `1960` = c(1960, 1960), `2000` = c(2000, 2000), stringsAsFactors = FALSE ) test_that("colnames_to_row works", { test <- colnames_to_row(foo) expect_identical( colnames(test), c("x1", "x2", "x3") ) expect_true( all( test[1, 1] == "ARG", test[1, 2] == "X1960", test[1, 3] == "X2000" ) ) expect_s3_class(test, "data.frame") }) test_that("colnames_to_row: check arg 'prefix'", { test <- colnames_to_row(foo, prefix = "hi") expect_identical( colnames(test), c("hi1", "hi2", "hi3") ) expect_error( colnames_to_row(test_num, prefix = 6), regexp = "Argument `prefix`" ) expect_error( colnames_to_row(test_num, prefix = c("A", "B")), regexp = "Argument `prefix`" ) expect_identical( colnames_to_row(test), colnames_to_row(test, prefix = "x") ) }) ================================================ FILE: tests/testthat/test-utils_rows.R ================================================ test_that("rownames_as_column works", { test <- rownames_as_column(mtcars, "new_column") expect_true("new_column" %in% names(test)) expect_identical(test[1, "new_column"], "Mazda RX4") }) test_that("rownames_as_column doesn't work if var is not a character", { expect_error( rownames_as_column(mtcars, var = 1), regexp = "Argument 'var' must be of type character" ) expect_error( rownames_as_column(mtcars, var = TRUE), regexp = "Argument 'var' must be of type character" ) }) test_that("rownames_as_column uses 'rowname' as default column name", { test <- rownames_as_column(mtcars, var = NULL) expect_true("rowname" %in% names(test)) }) test_that("rownames_as_column preserves labels", { test_data <- mtcars test_data <- assign_labels(test_data, select = "hp", variable = "horsepower") # ungrouped with_id <- rownames_as_column(test_data) expect_identical( attributes(with_id$hp)$label, "horsepower" ) # grouped with_id_grouped <- data_group(test_data, "cyl") with_id_grouped <- rownames_as_column(with_id_grouped) expect_identical( attributes(with_id_grouped$hp)$label, "horsepower" ) }) test_that("rownames_as_column preserves other attribs", { test_data <- standardize(mtcars) # ungrouped with_id <- rownames_as_column(test_data) expect_false(is.null(attributes(with_id)$center)) # grouped with_id_grouped <- data_group(test_data, "cyl") with_id_grouped <- rownames_as_column(with_id_grouped) expect_false(is.null(attributes(with_id_grouped)$center)) }) test_that("rownames_as_column errors if already var of same name", { expect_error( rownames_as_column(mtcars, "mpg"), "already a variable named" ) }) #------------------------------------------------- test_that("rowid_as_column works", { test <- rowid_as_column(mtcars, "new_column") expect_true("new_column" %in% names(test)) expect_identical(test$new_column, 1:32) }) test_that("rowid_as_column works with grouped data", { test_data <- data_group(iris, "Species") test <- rowid_as_column(test_data) expect_identical(test$rowid, rep(1:50, 3)) expect_true("rowid" %in% names(test)) }) test_that("rowid_as_column doesn't work if var is not a character", { expect_error( rowid_as_column(mtcars, var = 1), regexp = "Argument 'var' must be of type character" ) expect_error( rowid_as_column(mtcars, var = TRUE), regexp = "Argument 'var' must be of type character" ) }) test_that("rowid_as_column uses 'rowid' as default column name", { test <- rowid_as_column(mtcars, var = NULL) expect_true("rowid" %in% names(test)) }) test_that("rowid_as_column preserves labels", { test_data <- mtcars test_data <- assign_labels(test_data, select = "hp", variable = "horsepower") # ungrouped with_id <- rowid_as_column(test_data) expect_identical( attributes(with_id$hp)$label, "horsepower" ) # grouped with_id_grouped <- data_group(test_data, "cyl") with_id_grouped <- rowid_as_column(with_id_grouped) expect_identical( attributes(with_id_grouped$hp)$label, "horsepower" ) }) test_that("rowid_as_column preserves other attribs", { test_data <- standardize(mtcars) # ungrouped with_id <- rowid_as_column(test_data) expect_false(is.null(attributes(with_id)$center)) # grouped with_id_grouped <- data_group(test_data, "cyl") with_id_grouped <- rowid_as_column(with_id_grouped) expect_false(is.null(attributes(with_id_grouped)$center)) }) test_that("rowid_as_column has no issue if another variable is called 'var'", { foo <- data.frame( grp = c("A", "A", "B", "B"), var = 1:4, stringsAsFactors = FALSE ) out <- data_group(foo, grp) out <- rowid_as_column(out) expect_named(out, c("rowid", "grp", "var")) }) test_that("rowid_as_column errors if already var of same name", { expect_error( rowid_as_column(mtcars, "mpg"), "already a variable named" ) }) #------------------------------------------------- test_that("column_as_rownames works", { continents <- c( "Africa", "Asia", "Europe", "North America", "Oceania", "South America" ) test <- data.frame( continent = continents, some_value = seq(1, 6, by = 1) ) test2 <- column_as_rownames(test, "continent") expect_identical(rownames(test2), continents) expect_identical(ncol(test2), 1L) test3 <- column_as_rownames(test, 1) expect_identical(rownames(test3), continents) expect_identical(ncol(test3), 1L) }) test_that("column_as_rownames sanity checks work", { continents <- c( "Africa", "Asia", "Europe", "North America", "Oceania", "South America" ) test <- data.frame( continent = continents, some_value = seq(1, 6, by = 1) ) expect_error(column_as_rownames(test, TRUE), regexp = "Argument `var`") expect_error( column_as_rownames(test, "foo"), regexp = "not in the data frame" ) expect_error(column_as_rownames(test, 0), regexp = "does not exist") expect_error(column_as_rownames(test, 3), regexp = "does not exist") }) test_that("rownames_as_column and column_as_rownames cancel each other", { test <- rownames_as_column(mtcars) test2 <- column_as_rownames(test) expect_identical(test2, mtcars) }) test_that("column_as_rownames preserves labels", { test_data <- rownames_as_column(mtcars) test_data <- assign_labels(test_data, select = "hp", variable = "horsepower") # ungrouped with_id <- column_as_rownames(test_data) expect_identical( attributes(with_id$hp)$label, "horsepower" ) # grouped with_id_grouped <- data_group(test_data, "cyl") with_id_grouped <- column_as_rownames(with_id_grouped) expect_identical( attributes(with_id_grouped$hp)$label, "horsepower" ) }) test_that("column_as_rownames preserves other attribs", { test_data <- rownames_as_column(standardize(mtcars)) # ungrouped with_id <- column_as_rownames(test_data, "rowname") expect_false(is.null(attributes(with_id)$center)) # grouped with_id_grouped <- data_group(test_data, "cyl") with_id_grouped <- column_as_rownames(with_id_grouped) expect_false(is.null(attributes(with_id_grouped)$center)) }) ================================================ FILE: tests/testthat/test-weighted-stats.R ================================================ test_that("weighted centrality and dispersion measures work as expected", { x <- c(3.7, 3.3, 3.5, 2.8) wt <- c(5, 5, 4, 1) / 15 set.seed(123) expect_equal(weighted_mean(x, wt), 3.453333, tolerance = 0.001) expect_equal(weighted_median(x, wt), 3.5, tolerance = 0.001) expect_equal(weighted_sd(x, wt), 0.2852935, tolerance = 0.001) expect_equal(weighted_mad(x, wt), 0.29652, tolerance = 0.001) }) test_that("weighted centrality and dispersion measures work with NA", { x <- c(3.7, 3.3, NA, 3.5, 2.8, 5.5) wt <- c(5, 5, 4, NA, 1, 7) / 15 set.seed(123) expect_equal(weighted_mean(x, wt), 4.238889, tolerance = 0.001) expect_equal(weighted_median(x, wt), 3.7, tolerance = 0.001) expect_equal(weighted_sd(x, wt), 1.237671, tolerance = 0.001) expect_equal(weighted_mad(x, wt), 0.59304, tolerance = 0.001) }) test_that("weighted centrality and dispersion measures work with NA when not removed", { x <- c(3.7, 3.3, NA, 3.5, 2.8, 5.5) wt <- c(5, 5, 4, NA, 1, 7) / 15 set.seed(123) expect_identical(weighted_mean(x, wt, remove_na = FALSE), NA_real_) expect_identical(weighted_median(x, wt, remove_na = FALSE), NA_real_) expect_identical(weighted_sd(x, wt, remove_na = FALSE), NA_real_) expect_identical(weighted_mad(x, wt, remove_na = FALSE), NA_real_) }) test_that("weighted centrality and dispersion measures work with Inf", { x <- c(3.7, 3.3, NA, 3.5, 2.8, 5.5, Inf, 4) wt <- c(5, 5, 4, NA, 1, 7, 3, Inf) / 15 set.seed(123) expect_equal(weighted_mean(x, wt), 4.238889, tolerance = 0.001) expect_equal(weighted_median(x, wt), 3.7, tolerance = 0.001) expect_equal(weighted_sd(x, wt), 1.237671, tolerance = 0.001) expect_equal(weighted_mad(x, wt), 0.59304, tolerance = 0.001) }) ================================================ FILE: tests/testthat/test-winsorization.R ================================================ test_that("testing Winsorization of factors", { expect_identical(winsorize(as.factor(mtcars$am)), as.factor(mtcars$am)) }) test_that("with missing values", { skip_if_not_installed("ggplot2") expect_snapshot(suppressWarnings(head(winsorize(na.omit( ggplot2::msleep$brainwt ))))) expect_length(winsorize(as.factor(ggplot2::msleep$vore)), 83L) }) test_that("winsorize: threshold must be between 0 and 1", { expect_error( winsorize(sample(1:10, 5), threshold = -0.1), regexp = "must be a scalar between 0 and 0.5" ) expect_error( winsorize(sample(1:10, 5), threshold = 1.1), regexp = "must be a scalar between 0 and 0.5" ) expect_error( winsorize(sample(1:10, 5), method = "zscore", threshold = -3), regexp = "must be a scalar greater than 0" ) expect_error( winsorize( sample(1:10, 5), method = "zscore", threshold = -3, robust = TRUE ), regexp = "must be a scalar greater than 0" ) expect_error( winsorize(sample(1:10, 5), method = "raw", threshold = 1.1), regexp = "must be of length 2 for lower and upper bound" ) }) test_that("winsorize on data.frame", { iris2 <- winsorize(iris) expect_identical( iris2$Sepal.Length, winsorize(iris$Sepal.Length) ) expect_identical( iris2$Petal.Width, winsorize(iris$Petal.Width) ) expect_named(iris2, names(iris)) }) ================================================ FILE: tests/testthat.R ================================================ library(testthat) library(datawizard) test_check("datawizard") ================================================ FILE: vignettes/.gitignore ================================================ *.html *.R ================================================ FILE: vignettes/bibliography.bib ================================================ @Manual{revelle2018, title = {psych: Procedures for Psychological, Psychometric, and Personality Research}, author = {William Revelle}, organization = { Northwestern University}, address = { Evanston, Illinois}, year = {2018}, note = {R package version 1.8.12}, url = {https://CRAN.R-project.org/package=psych} } @article{makowski2018psycho, title={The psycho package: An efficient and publishing-oriented workflow for psychological science}, author={Makowski, Dominique}, journal={Journal of Open Source Software}, volume={3}, number={22}, pages={470}, year={2018} } @article{menard2011standards, title={Standards for standardized logistic regression coefficients}, author={Menard, Scott}, journal={Social Forces}, volume={89}, number={4}, pages={1409--1428}, year={2011}, publisher={The University of North Carolina Press} } @article{schielzeth2010simple, title={Simple means to improve the interpretability of regression coefficients}, author={Schielzeth, Holger}, journal={Methods in Ecology and Evolution}, volume={1}, number={2}, pages={103--113}, year={2010}, publisher={Wiley Online Library} } @article{gelman2008scaling, title={Scaling regression inputs by dividing by two standard deviations}, author={Gelman, Andrew}, journal={Statistics in medicine}, volume={27}, number={15}, pages={2865--2873}, year={2008}, publisher={Wiley Online Library} } @article{menard2004six, title={Six approaches to calculating standardized logistic regression coefficients}, author={Menard, Scott}, journal={The American Statistician}, volume={58}, number={3}, pages={218--223}, year={2004}, publisher={Taylor \& Francis} } @article{bring1994standardize, title={How to standardize regression coefficients}, author={Bring, Johan}, journal={The American Statistician}, volume={48}, number={3}, pages={209--213}, year={1994}, publisher={Taylor \& Francis} } @article{neter1989applied, title={Applied linear regression models}, author={Neter, John and Wasserman, William and Kutner, Michael H}, year={1989}, publisher={Irwin Homewood, IL} } @book{gelman_data_2007, address = {Cambridge; New York}, series = {Analytical methods for social research}, title = {Data analysis using regression and multilevel/hierarchical models}, isbn = {978-0-521-86706-1 978-0-521-68689-1}, publisher = {Cambridge University Press}, author = {Gelman, Andrew and Hill, Jennifer}, year = {2007} } @inproceedings{bafumi_fitting_2006, address = {Philadelphia, PA}, title = {Fitting Multilevel Models When Predictors and Group Effects Correlate.}, author = {Bafumi, Joseph and Gelman, Andrew}, year = {2006} } @article{bell_fixed_2019, title = {Fixed and random effects models: making an informed choice}, issn = {1573-7845}, url = {https://doi.org/10.1007/s11135-018-0802-x}, doi = {10.1007/s11135-018-0802-x}, journal = {Quality \& Quantity}, author = {Bell, Andrew and Fairbrother, Malcolm and Jones, Kelvyn}, volume = {53}, year = {2019}, pages = {1051--1074} } @article{bell_explaining_2015, title = {Explaining Fixed Effects: Random Effects Modeling of Time-Series Cross-Sectional and Panel Data}, volume = {3}, issn = {2049-8470, 2049-8489}, url = {https://www.cambridge.org/core/product/identifier/S2049847014000077/type/journal_article}, doi = {10.1017/psrm.2014.7}, number = {1}, journal = {Political Science Research and Methods}, author = {Bell, Andrew and Jones, Kelvyn}, month = jan, year = {2015}, pages = {133--153} } @article{heisig_costs_2017, title = {The Costs of Simplicity: Why Multilevel Models May Benefit from Accounting for Cross-Cluster Differences in the Effects of Controls}, volume = {82}, issn = {0003-1224, 1939-8271}, url = {http://journals.sagepub.com/doi/10.1177/0003122417717901}, doi = {10.1177/0003122417717901}, number = {4}, journal = {American Sociological Review}, author = {Heisig, Jan Paul and Schaeffer, Merlin and Giesecke, Johannes}, month = aug, year = {2017}, pages = {796--827} } @article{shor_bayesian_2007, title = {A Bayesian Multilevel Modeling Approach to Time-Series Cross-Sectional Data}, volume = {15}, issn = {1047-1987, 1476-4989}, doi = {10.1093/pan/mpm006}, number = {2}, journal = {Political Analysis}, author = {Shor, Boris and Bafumi, Joseph and Keele, Luke and Park, David}, year = {2007}, pages = {165--181} } @article{mundlak_pooling_1978, title = {On the Pooling of Time Series and Cross Section Data}, volume = {46}, number = {1}, journal = {Econometrica}, author = {Mundlak, Yair}, month = jan, year = {1978}, pages = {69} } ================================================ FILE: vignettes/overview_of_vignettes.Rmd ================================================ --- title: "Overview of Vignettes" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Overview of Vignettes} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r message=FALSE, warning=FALSE, include=FALSE} library(knitr) knitr::opts_chunk$set( echo = TRUE, collapse = TRUE, warning = FALSE, message = FALSE, comment = "#>", eval = TRUE ) ``` All package vignettes are available at [https://easystats.github.io/datawizard/](https://easystats.github.io/datawizard/). ## Function Overview * [Function Reference](https://easystats.github.io/datawizard/reference/index.html) ## Data Preparation * [Coming from 'tidyverse'](https://easystats.github.io/datawizard/articles/tidyverse_translation.html) * [A quick summary of selection syntax in `{datawizard}`](https://easystats.github.io/datawizard/articles/selection_syntax.html) ## Statistical Transformations * [Data Standardization](https://easystats.github.io/datawizard/articles/standardize_data.html) ================================================ FILE: vignettes/selection_syntax.Rmd ================================================ --- title: "A quick summary of selection syntax in `{datawizard}`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{A quick summary of selection syntax in `{datawizard}`} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) pkgs <- c( "datawizard", "dplyr" ) if (!all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)))) { knitr::opts_chunk$set(eval = FALSE) } ``` ```{r load, echo=FALSE, message=FALSE} library(datawizard) library(dplyr) set.seed(123) iris <- iris[sample(nrow(iris), 10), ] row.names(iris) <- NULL ``` ```{css, echo=FALSE} .custom_note { border-left: solid 5px hsl(220, 100%, 30%); background-color: hsl(220, 100%, 95%); padding: 5px; margin-bottom: 10px } ``` This vignette can be referred to by citing the following: Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. *Journal of Open Source Software*, *7*(78), 4684, https://doi.org/10.21105/joss.04684 # Selecting variables ## Quoted names This is the most simple way to select one or several variables. Just use a character vector containing variables names, like in base R. ```{r} data_select(iris, c("Sepal.Length", "Petal.Width")) ``` ## Unquoted names It is also possible to use unquoted names. This is useful if we use the `tidyverse` and want to be consistent about the way variable names are passed. ```{r} iris %>% group_by(Species) %>% standardise(Petal.Length) %>% ungroup() ``` ## Positions In addition to variable names, `select` can also take indices for the variables to select in the dataframe. ```{r} data_select(iris, c(1, 2, 5)) ``` ## Functions We can also pass a function to the `select` argument. This function will be applied to all columns and should return `TRUE` or `FALSE`. For example, if we want to keep only numeric columns, we can use `is.numeric`. ```{r} data_select(iris, is.numeric) ``` Note that we can provide any custom function to `select`, *provided it returns `TRUE` or `FALSE`* when applied to a column. ```{r} my_function <- function(i) { is.numeric(i) && mean(i, na.rm = TRUE) > 3.5 } data_select(iris, my_function) ``` ## Patterns With larger datasets, it would be tedious to write the names of variables to select, and it would be fragile to rely on variable positions as they may change later. To this end, we can use four select helpers: `starts_with()`, `ends_with()`, `contains()`, and `regex()`. The first three can take several patterns, while `regex()` takes a single regular expression. ```{r} data_select(iris, starts_with("Sep", "Peta")) data_select(iris, ends_with("dth", "ies")) data_select(iris, contains("pal", "ec")) data_select(iris, regex("^Sep|ies")) ```

Note: these functions are not exported by `datawizard` but are detected and applied internally. This means that they won't be detected by autocompletion when we write them.

Note #2: because these functions are not exported, they will not create conflicts with the ones that come from the `tidyverse` and that have the same name. Therefore, we can still use `dplyr` and its friends, it won't change anything for selection in `datawizard` functions!

# Excluding variables What if we want to keep all variables except for a few ones? There are two ways we can invert our selection. The first way is to put a minus sign `"-"` in front of the `select` argument. ```{r} data_select(iris, -c("Sepal.Length", "Petal.Width")) data_select(iris, -starts_with("Sep", "Peta")) data_select(iris, -is.numeric) ``` Note that if we use numeric indices, we can't mix negative and positive values. This means that we have to use `select = -(1:2)` if we want to exclude the first two columns; `select = -1:2` will *not* work: ```{r} data_select(iris, -(1:2)) ``` Same thing for variable names: ```{r} data_select(iris, -(Petal.Length:Species)) ``` The second way is to use the argument `exclude`. This argument has the same possibilities as `select`. Although this may not be required in most contexts, if we wanted to, we could use both `select` and `exclude` arguments at the same time. ```{r} data_select(iris, exclude = c("Sepal.Length", "Petal.Width")) data_select(iris, exclude = starts_with("Sep", "Peta")) ``` # Programming with selections Since `datawizard` 0.6.0, it is possible to pass function arguments and loop indices in `select` and `exclude` arguments. This makes it easier to program with `datawizard`. For example, if we want to let the user decide the selection they want to use: ```{r} my_function <- function(data, selection) { extract_column_names(data, select = selection) } my_function(iris, "Sepal.Length") my_function(iris, starts_with("Sep")) my_function_2 <- function(data, pattern) { extract_column_names(data, select = starts_with(pattern)) } my_function_2(iris, "Sep") ``` It is also possible to pass these values in loops, for example if we have a list of patterns and we want to relocate columns based on these patterns, one by one: ```{r} new_iris <- iris for (i in c("Sep", "Pet")) { new_iris <- new_iris %>% data_relocate(select = starts_with(i), after = -1) } new_iris ``` In the loop above, all columns starting with `"Sep"` are moved at the end of the data frame, and the same thing was made with all columns starting with `"Pet"`. # Useful to know ## Ignore the case In every selection that uses variable names, we can ignore the case in the selection by applying `ignore_case = TRUE`. ```{r} data_select(iris, c("sepal.length", "petal.width"), ignore_case = TRUE) data_select(iris, ~ Sepal.length + petal.Width, ignore_case = TRUE) data_select(iris, starts_with("sep", "peta"), ignore_case = TRUE) ``` ## Formulas It is also possible to use formulas to select variables: ```{r} data_select(iris, ~ Sepal.Length + Petal.Width) ``` This made it easier to use selection in custom functions before `datawizard` 0.6.0, and is kept available for backward compatibility. ================================================ FILE: vignettes/standardize_data.Rmd ================================================ --- title: "Data Standardization" output: rmarkdown::html_vignette: toc: true fig_width: 10.08 fig_height: 6 vignette: > \usepackage[utf8]{inputenc} %\VignetteIndexEntry{Data Standardization} %\VignetteEngine{knitr::rmarkdown} --- ```{r message=FALSE, warning=FALSE, include=FALSE} options(knitr.kable.NA = "") knitr::opts_chunk$set( comment = "#>", message = FALSE, warning = FALSE, dpi = 300 ) pkgs <- c( "datawizard", "poorman", "see", "ggplot2", "parameters", "lme4", "curl" ) pkg_available <- all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L))) if (pkg_available) { net_available <- curl::has_internet() } else { net_available <- FALSE } if (!pkg_available || !net_available) { knitr::opts_chunk$set(eval = FALSE) } ``` This vignette can be referred to by citing the following: > Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. *Journal of Open Source Software*, *7*(78), 4684, https://doi.org/10.21105/joss.04684 # Introduction To make sense of their data and effects, scientists might want to standardize (Z-score) their variables. This makes the data unitless, expressed only in terms of deviation from an index of centrality (e.g., the mean or the median). However, aside from some benefits, standardization also comes with challenges and issues, that the scientist should be aware of. ## Methods of Standardization The `datawizard` package offers two methods of standardization via the `standardize()` function: - **Normal standardization**: center around the *mean*, with *SD* units (default). - **Robust standardization**: center around the *median*, with *MAD* (median absolute deviation) units (`robust = TRUE`). Let's look at the following example: ```{r} library(datawizard) library(effectsize) # for data # let's have a look at what the data look like data("hardlyworking", package = "effectsize") head(hardlyworking) # let's use both methods of standardization hardlyworking$xtra_hours_z <- standardize(hardlyworking$xtra_hours) hardlyworking$xtra_hours_zr <- standardize(hardlyworking$xtra_hours, robust = TRUE) ``` We can see that different methods give different central and variation values: ```{r, eval=FALSE} library(dplyr) hardlyworking %>% select(starts_with("xtra_hours")) %>% data_to_long() %>% group_by(Name) %>% summarise( mean = mean(Value), sd = sd(Value), median = median(Value), mad = mad(Value) ) ``` ```{r, echo=FALSE} library(poorman) hardlyworking %>% select(starts_with("xtra_hours")) %>% reshape_longer(names_to = "name", values_to = "value") %>% group_by(name) %>% summarise( mean = mean(value), sd = sd(value), median = median(value), mad = mad(value) ) %>% knitr::kable(digits = 4) ``` `standardize()` can also be used to standardize a full data frame - where each numeric variable is standardized separately: ```{r} hardlyworking_z <- standardize(hardlyworking) ``` ```{r, eval=FALSE} hardlyworking_z %>% select(-xtra_hours_z, -xtra_hours_zr) %>% data_to_long() %>% group_by(Name) %>% summarise( mean = mean(Value), sd = sd(Value), median = median(Value), mad = mad(Value) ) ``` ```{r, echo=FALSE} hardlyworking_z %>% select(-xtra_hours_z, -xtra_hours_zr) %>% reshape_longer(names_to = "name", values_to = "value") %>% group_by(name) %>% summarise( mean = mean(value), sd = sd(value), median = median(value), mad = mad(value) ) %>% knitr::kable(digits = 4) ``` Weighted standardization is also supported via the `weights` argument, and factors can also be standardized (if you're into that kind of thing) by setting `force = TRUE`, which converts factors to treatment-coded dummy variables before standardizing. ## Variable-wise *vs.* Participant-wise Standardization is an important step and extra caution is required in **repeated-measures designs**, in which there are three ways of standardizing data: - **Variable-wise**: The most common method. A simple scaling of each column. - **Participant-wise**: Variables are standardized "within" each participant, *i.e.*, for each participant, by the participant's mean and SD. - **Full**: Participant-wise first and then re-standardizing variable-wise. Unfortunately, the method used is often not explicitly stated. This is an issue as these methods can generate important discrepancies (that can in turn contribute to the reproducibility crisis). Let's investigate these 3 methods. ### The Data We will take the `emotion` dataset in which participants were exposed to negative pictures and had to rate their emotions (**valence**) and the amount of memories associated with the picture (**autobiographical link**). One could make the hypothesis that for young participants with no context of war or violence, the most negative pictures (mutilations) are less related to memories than less negative pictures (involving for example car crashes or sick people). In other words, **we expect a positive relationship between valence** (with high values corresponding to less negativity) **and autobiographical link**. Let's have a look at the data, averaged by participants: ```{r, eval=FALSE} # Download the 'emotion' dataset load(url("https://raw.githubusercontent.com/neuropsychology/psycho.R/master/data/emotion.rda")) # Discard neutral pictures (keep only negative) emotion <- emotion %>% filter(Emotion_Condition == "Negative") # Summary emotion %>% drop_na(Subjective_Valence, Autobiographical_Link) %>% group_by(Participant_ID) %>% summarise( n_Trials = n(), Valence_Mean = mean(Subjective_Valence), Valence_SD = sd(Subjective_Valence) ) ``` ```{r, echo=FALSE} load(url("https://raw.githubusercontent.com/neuropsychology/psycho.R/master/data/emotion.rda")) # Discard neutral pictures (keep only negative) emotion <- emotion %>% filter(Emotion_Condition == "Negative") # Summary emotion %>% subset(!(is.na(Subjective_Valence) | is.na(Autobiographical_Link))) %>% group_by(Participant_ID) %>% summarise( n_Trials = n(), Valence_Mean = mean(Subjective_Valence), Valence_SD = sd(Subjective_Valence) ) ``` As we can see from the means and SDs, there is a lot of variability **between** participants both in their means and their individual *within*-participant SD. ### Effect of Standardization We will create three data frames standardized with each of the three techniques. ```{r, warning=FALSE} Z_VariableWise <- emotion %>% standardize() Z_ParticipantWise <- emotion %>% group_by(Participant_ID) %>% standardize() Z_Full <- emotion %>% group_by(Participant_ID) %>% standardize() %>% ungroup() %>% standardize() ``` Let's see how these three standardization techniques affected the **Valence** variable. ### Across Participants We can calculate the mean and SD of *Valence* across all participants: ```{r, eval=FALSE} # Create a convenient function to print summarise_Subjective_Valence <- function(data) { df_name <- deparse(substitute(data)) data %>% ungroup() %>% summarise( DF = df_name, Mean = mean(Subjective_Valence), SD = sd(Subjective_Valence) ) } # Check the results rbind( summarise_Subjective_Valence(Z_VariableWise), summarise_Subjective_Valence(Z_ParticipantWise), summarise_Subjective_Valence(Z_Full) ) ``` ```{r, echo=FALSE} # Create a convenient function to print summarise_Subjective_Valence <- function(data) { df_name <- deparse(substitute(data)) data <- data %>% ungroup() %>% summarise( Mean = mean(Subjective_Valence), SD = sd(Subjective_Valence) ) cbind(DF = df_name, data) } # Check the results rbind( summarise_Subjective_Valence(Z_VariableWise), summarise_Subjective_Valence(Z_ParticipantWise), summarise_Subjective_Valence(Z_Full) ) %>% knitr::kable(digits = 2) ``` The **means** and the **SD** appear as fairly similar (0 and 1)... ```{r, fig.width=7, fig.height=4.5, results='markup', fig.align='center'} library(see) library(ggplot2) ggplot() + geom_density(aes(Z_VariableWise$Subjective_Valence, color = "Z_VariableWise" ), linewidth = 1) + geom_density(aes(Z_ParticipantWise$Subjective_Valence, color = "Z_ParticipantWise" ), linewidth = 1) + geom_density(aes(Z_Full$Subjective_Valence, color = "Z_Full" ), linewidth = 1) + see::theme_modern() + labs(color = "") ``` and so do the marginal distributions... ### At the Participant Level However, we can also look at what happens in the participant level. Let's look at the first 5 participants: ```{r, eval=FALSE} # Create convenient function print_participants <- function(data) { df_name <- deparse(substitute(data)) data %>% group_by(Participant_ID) %>% summarise( DF = df_name, Mean = mean(Subjective_Valence), SD = sd(Subjective_Valence) ) %>% head(5) %>% select(DF, everything()) } # Check the results rbind( print_participants(Z_VariableWise), print_participants(Z_ParticipantWise), print_participants(Z_Full) ) ``` ```{r, echo=FALSE} # Create convenient function print_participants <- function(data) { df_name <- deparse(substitute(data)) data %>% group_by(Participant_ID) %>% summarise( Mean = mean(Subjective_Valence), SD = sd(Subjective_Valence) ) %>% cbind(DF = df_name, .) %>% head(5) %>% select(DF, everything()) } # Check the results rbind( print_participants(Z_VariableWise), print_participants(Z_ParticipantWise), print_participants(Z_Full) ) %>% knitr::kable(digits = 2) ``` Seems like *full* and *participant-wise* standardization give similar results, but different ones than *variable-wise* standardization. ### Compare Let's do a **correlation** between the **variable-wise and participant-wise methods**. ```{r, fig.width=7, fig.height=4.5, results='markup', fig.align='center'} r <- cor.test( Z_VariableWise$Subjective_Valence, Z_ParticipantWise$Subjective_Valence ) data.frame( Original = emotion$Subjective_Valence, VariableWise = Z_VariableWise$Subjective_Valence, ParticipantWise = Z_ParticipantWise$Subjective_Valence ) %>% ggplot(aes(x = VariableWise, y = ParticipantWise, colour = Original)) + geom_point(alpha = 0.75, shape = 16) + geom_smooth(method = "lm", color = "black") + scale_color_distiller(palette = 1) + ggtitle(paste0("r = ", round(r$estimate, 2))) + see::theme_modern() ``` While the three standardization methods roughly present the same characteristics at a general level (mean 0 and SD 1) and a similar distribution, their values are not exactly the same! Let's now answer the original question by investigating the **linear relationship between valence and autobiographical link**. We can do this by running a mixed-effects model with participants entered as random effects. ```{r} library(lme4) m_raw <- lmer( formula = Subjective_Valence ~ Autobiographical_Link + (1 | Participant_ID), data = emotion ) m_VariableWise <- update(m_raw, data = Z_VariableWise) m_ParticipantWise <- update(m_raw, data = Z_ParticipantWise) m_Full <- update(m_raw, data = Z_Full) ``` We can extract the parameters of interest from each model, and find: ```{r} # Convenient function get_par <- function(model) { mod_name <- deparse(substitute(model)) parameters::model_parameters(model) %>% mutate(Model = mod_name) %>% select(-Parameter) %>% select(Model, everything()) %>% .[-1, ] } # Run the model on all datasets rbind( get_par(m_raw), get_par(m_VariableWise), get_par(m_ParticipantWise), get_par(m_Full) ) ``` As we can see, **variable-wise** standardization only affects **the coefficient** (which is expected, as it changes the unit), but not the test statistic or statistical significance. However, using **participant-wise** standardization *does* affect the coefficient **and** the significance. **No method is better or more justified, and the choice depends on the specific case, context, data and goal.** ### Conclusion 1. **Standardization can be useful in *some* cases and should be justified**. 2. **Variable and Participant-wise standardization methods *appear* to produce similar data**. 3. **Variable and Participant-wise standardization can lead to different results**. 4. **The chosen method can strongly influence the results and should therefore be explicitly stated and justified to enhance reproducibility of results**. We showed here yet another way of **sneakily tweaking the data** that can change the results. To prevent its use as a bad practice, we can only highlight the importance of open data, open analysis/scripts, and preregistration. # See also - `datawizard::demean()`: - `standardize_parameters(method = "pseudo")` for mixed-effects models # References ================================================ FILE: vignettes/tidyverse_translation.Rmd ================================================ --- title: "Coming from 'tidyverse'" output: rmarkdown::html_vignette: toc: true vignette: > \usepackage[utf8]{inputenc} %\VignetteIndexEntry{Coming from 'tidyverse'} %\VignetteEngine{knitr::rmarkdown} --- ```{r setup, message=FALSE, warning=FALSE, include=FALSE, eval = TRUE} library(knitr) options(knitr.kable.NA = "") knitr::opts_chunk$set( eval = FALSE, message = FALSE, warning = FALSE, dpi = 300 ) pkgs <- c( "dplyr", "tidyr" ) all_deps_available <- all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L))) if (all_deps_available) { library(datawizard) library(dplyr) library(tidyr) } # Since we explicitly put `eval = TRUE` for some chunks, we can't rely on # `knitr::opts_chunk$set(eval = FALSE)` at the beginning of the script. # Therefore, we introduce a logical that is `FALSE` only if all suggested # dependencies are not installed (cf easystats/easystats#317) evaluate_chunk <- all_deps_available && getRversion() >= "4.1.0" ``` This vignette can be referred to by citing the following: Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. *Journal of Open Source Software*, *7*(78), 4684, https://doi.org/10.21105/joss.04684 ```{css, echo=FALSE, eval = TRUE} .datawizard, .datawizard > .sourceCode { background-color: #e6e6ff; } .tidyverse, .tidyverse > .sourceCode { background-color: #d9f2e5; } .custom_note { border-left: solid 5px hsl(220, 100%, 30%); background-color: hsl(220, 100%, 95%); padding: 5px; margin-bottom: 10px } ``` # Introduction `{datawizard}` package aims to make basic data wrangling easier than with base R. The data wrangling workflow it supports is similar to the one supported by the tidyverse package combination of `{dplyr}` and `{tidyr}`. However, one of its main features is that it has a very few dependencies: `{stats}` and `{utils}` (included in base R) and `{insight}`, which is the core package of the _easystats_ ecosystem. This package grew organically to simultaneously satisfy the "0 non-base hard dependency" principle of _easystats_ and the data wrangling needs of the constituent packages in this ecosystem. It is also important to note that `{datawizard}` was designed to avoid namespace collisions with `{tidyverse}` packages. In this article, we will see how to go through basic data wrangling steps with `{datawizard}`. We will also compare it to the `{tidyverse}` syntax for achieving the same. This way, if you decide to make the switch, you can easily find the translations here. This vignette is largely inspired from `{dplyr}`'s [Getting started vignette](https://dplyr.tidyverse.org/articles/dplyr.html).

Note: In this vignette, we use the native pipe-operator, `|>`, which was introduced in R 4.1. Users of R version 3.6 or 4.0 should replace the native pipe by magrittr's one (`%>%`) so that examples work.

```{r, eval = evaluate_chunk} library(dplyr) library(tidyr) library(datawizard) data(efc) efc <- head(efc) ``` # Workhorses Before we look at their *tidyverse* equivalents, we can first have a look at `{datawizard}`'s key functions for data wrangling: | Function | Operation | | :---------------- | :--------------------------------------------------------------- | | `data_filter()` | [to select only certain observations](#filtering) | | `data_select()` | [to select only a few variables](#selecting) | | `data_modify()` | [to create variables or modify existing ones](#modifying) | | `data_arrange()` | [to sort observations](#sorting) | | `data_extract()` | [to extract a single variable](#extracting) | | `data_rename()` | [to rename variables](#renaming) | | `data_relocate()` | [to reorder a data frame](#relocating) | | `data_to_long()` | [to convert data from wide to long](#reshaping) | | `data_to_wide()` | [to convert data from long to wide](#reshaping) | | `data_join()` | [to join two data frames](#joining) | | `data_unite()` | [to concatenate several columns into a single one](#uniting) | | `data_separate()` | [to separate a single column into multiple columns](#separating) | Note that there are a few functions in `{datawizard}` that have no strict equivalent in `{dplyr}` or `{tidyr}` (e.g `data_rotate()`), and so we won't discuss them in the next section. # Equivalence with `{dplyr}` / `{tidyr}` Before we look at them individually, let's first have a look at the summary table of this equivalence. | Function | Tidyverse equivalent(s) | | :---------------- | :------------------------------------------------------------------ | | `data_filter()` | `dplyr::filter()`, `dplyr::slice()` | | `data_select()` | `dplyr::select()` | | `data_modify()` | `dplyr::mutate()` | | `data_arrange()` | `dplyr::arrange()` | | `data_extract()` | `dplyr::pull()` | | `data_rename()` | `dplyr::rename()` | | `data_relocate()` | `dplyr::relocate()` | | `data_to_long()` | `tidyr::pivot_longer()` | | `data_to_wide()` | `tidyr::pivot_wider()` | | `data_join()` | `dplyr::inner_join()`, `dplyr::left_join()`, `dplyr::right_join()`, | | | `dplyr::full_join()`, `dplyr::anti_join()`, `dplyr::semi_join()` | | `data_peek()` | `dplyr::glimpse()` | | `data_unite()` | `tidyr::unite()` | | `data_separate()` | `tidyr::separate()` | ## Filtering {#filtering} `data_filter()` is a wrapper around `subset()`. However, if you want to have several filtering conditions, you can either use `&` (as in `subset()`) or `,` (as in `dplyr::filter()`). :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r filter, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_filter( skin_color == "light", eye_color == "brown" ) # or starwars |> data_filter( skin_color == "light" & eye_color == "brown" ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> filter( skin_color == "light", eye_color == "brown" ) ``` ::: :::: ```{r filter, eval = evaluate_chunk, echo = FALSE} ``` ```{r, echo = FALSE, eval = evaluate_chunk} starwars <- head(starwars) ``` ## Selecting {#selecting} `data_select()` is the equivalent of `dplyr::select()`. The main difference between these two functions is that `data_select()` uses two arguments (`select` and `exclude`) and requires quoted column names if we want to select several variables, while `dplyr::select()` accepts any unquoted column names. :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r select1, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_select(select = c("hair_color", "skin_color", "eye_color")) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> select(hair_color, skin_color, eye_color) ``` ::: :::: ```{r select1, eval = evaluate_chunk, echo = FALSE} ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r select2, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_select(select = -ends_with("color")) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> select(-ends_with("color")) ``` ::: :::: ```{r select2, eval = evaluate_chunk, echo = FALSE} ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r select3, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_select(select = -(hair_color:eye_color)) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> select(!(hair_color:eye_color)) ``` ::: :::: ```{r select3, eval = evaluate_chunk, echo = FALSE} ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r select4, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_select(exclude = regex("color$")) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> select(-contains("color$")) ``` ::: :::: ```{r select4, eval = evaluate_chunk, echo = FALSE} ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r select5, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_select(select = is.numeric) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> select(where(is.numeric)) ``` ::: :::: ```{r select5, eval = evaluate_chunk, echo = FALSE} ``` You can find a list of all the select helpers with `?data_select`. ## Modifying {#modifying} `data_modify()` is a wrapper around `base::transform()` but has several additional benefits: * it allows us to use newly created variables in the following expressions; * it works with grouped data; * it preserves variable attributes such as labels; * it accepts expressions as character vectors so that it is easy to program with it This last point is also the main difference between `data_modify()` and `dplyr::mutate()`. :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r modify1, class.source = "datawizard"} # ---------- datawizard ----------- efc |> data_modify( c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), c12hour_z2 = standardize(c12hour) ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- efc |> mutate( c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), c12hour_z2 = standardize(c12hour) ) ``` ::: :::: ```{r modify1, eval = evaluate_chunk, echo = FALSE} ``` `data_modify()` supports expressions as strings via its `as_expr()` helper function. ```{r eval=evaluate_chunk} new_exp <- c( "c12hour_c = center(c12hour)", "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)" ) data_modify(efc, as_expr(new_exp)) ``` This makes it easy to use it in custom functions: ```{r eval=evaluate_chunk} miles_to_km <- function(data, var) { data_modify( data, as_expr(paste0("km = ", var, "* 1.609344")) ) } distance <- data.frame(miles = c(1, 8, 233, 88, 9)) distance miles_to_km(distance, "miles") ``` ## Sorting {#sorting} `data_arrange()` is the equivalent of `dplyr::arrange()`. It takes two arguments: a data frame, and a vector of column names used to sort the rows. Note that contrary to most other functions in `{datawizard}`, it is not possible to use select helpers such as `starts_with()` in `data_arrange()`. :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} :::{} ```{r arrange1, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_arrange(c("hair_color", "height")) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> arrange(hair_color, height) ``` ::: :::: ```{r arrange1, eval = evaluate_chunk, echo = FALSE} ``` You can also sort variables in descending order by putting a `"-"` in front of their name, like below: :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} :::{} ```{r arrange2, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_arrange(c("-hair_color", "-height")) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> arrange(desc(hair_color), -height) ``` ::: :::: ```{r arrange2, eval = evaluate_chunk, echo = FALSE} ``` ## Extracting {#extracting} Although we mostly work on data frames, it is sometimes useful to extract a single column as a vector. This can be done with `data_extract()`, which reproduces the behavior of `dplyr::pull()`: :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} :::{} ```{r extract1, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_extract(gender) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> pull(gender) ``` ::: :::: ```{r extract1, eval = evaluate_chunk, echo = FALSE} ``` We can also specify several variables in `select`. In this case, `data_extract()` is equivalent to `data_select()`: ```{r eval = evaluate_chunk} starwars |> data_extract(select = contains("color")) ``` ## Renaming {#renaming} `data_rename()` is the equivalent of `dplyr::rename()` but the syntax between the two is different. While `dplyr::rename()` takes new-old pairs of column names, `data_rename()` requires a vector of column names to rename, and then a vector of new names for these columns that must be of the same length. :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r rename1, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_rename( select = c("sex", "hair_color"), replacement = c("Sex", "Hair Color") ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> rename( Sex = sex, "Hair Color" = hair_color ) ``` ::: :::: ```{r rename1, eval = evaluate_chunk, echo = FALSE} ``` The way `data_rename()` is designed makes it easy to apply the same modifications to a vector of column names. For example, we can remove underscores and use TitleCase with the following code: ```{r rename2} to_rename <- names(starwars) starwars |> data_rename( select = to_rename, replacement = tools::toTitleCase(gsub("_", " ", to_rename, fixed = TRUE)) ) ``` ```{r rename2, eval = evaluate_chunk, echo = FALSE} ``` It is also possible to add a prefix or a suffix to all or a subset of variables with `data_addprefix()` and `data_addsuffix()`. The argument `select` accepts all select helpers that we saw above with `data_select()`: ```{r rename3} starwars |> data_addprefix( pattern = "OLD.", select = contains("color") ) |> data_addsuffix( pattern = ".NEW", select = -contains("color") ) ``` ```{r rename3, eval = evaluate_chunk, echo = FALSE} ``` ## Relocating {#relocating} Sometimes, we want to relocate one or a small subset of columns in the dataset. Rather than typing many names in `data_select()`, we can use `data_relocate()`, which is the equivalent of `dplyr::relocate()`. Just like `data_select()`, we can specify a list of variables we want to relocate with `select` and `exclude`. Then, the arguments `before` and `after`^[Note that we use `before` and `after` whereas `dplyr::relocate()` uses `.before` and `.after`.] specify where the selected columns should be relocated: :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r relocate1, class.source = "datawizard"} # ---------- datawizard ----------- starwars |> data_relocate(sex:homeworld, before = "height") ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- starwars |> relocate(sex:homeworld, .before = height) ``` ::: :::: ```{r relocate1, eval = evaluate_chunk, echo = FALSE} ``` In addition to column names, `before` and `after` accept column indices. Finally, one can use `before = -1` to relocate the selected columns just before the last column, or `after = -1` to relocate them after the last column. ```{r eval = evaluate_chunk} # ---------- datawizard ----------- starwars |> data_relocate(sex:homeworld, after = -1) ``` ## Reshaping {#reshaping} ### Longer Reshaping data from wide to long or from long to wide format can be done with `data_to_long()` and `data_to_wide()`. These functions were designed to match `tidyr::pivot_longer()` and `tidyr::pivot_wider()` arguments, so that the only thing to do is to change the function name. However, not all of `tidyr::pivot_longer()` and `tidyr::pivot_wider()` features are available yet. We will use the `relig_income` dataset, as in the [`{tidyr}` vignette](https://tidyr.tidyverse.org/articles/pivot.html). ```{r eval = evaluate_chunk} relig_income ``` We would like to reshape this dataset to have 3 columns: religion, count, and income. The column "religion" doesn't need to change, so we exclude it with `-religion`. Then, each remaining column corresponds to an income category. Therefore, we want to move all these column names to a single column called "income". Finally, the values corresponding to each of these columns will be reshaped to be in a single new column, called "count". :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r pivot1, class.source = "datawizard"} # ---------- datawizard ----------- relig_income |> data_to_long( -religion, names_to = "income", values_to = "count" ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- relig_income |> pivot_longer( !religion, names_to = "income", values_to = "count" ) ``` ::: :::: ```{r pivot1, eval = evaluate_chunk, echo = FALSE} ``` To explore a bit more the arguments of `data_to_long()`, we will use another dataset: the `billboard` dataset. ```{r eval = evaluate_chunk} billboard ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r pivot2, class.source = "datawizard"} # ---------- datawizard ----------- billboard |> data_to_long( cols = starts_with("wk"), names_to = "week", values_to = "rank", values_drop_na = TRUE ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- billboard |> pivot_longer( cols = starts_with("wk"), names_to = "week", values_to = "rank", values_drop_na = TRUE ) ``` ::: :::: ```{r pivot2, eval = evaluate_chunk, echo = FALSE} ``` ### Wider Once again, we use an example in the `{tidyr}` vignette to show how close `data_to_wide()` and `pivot_wider()` are: ```{r eval = evaluate_chunk} fish_encounters ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r pivot3, class.source = "datawizard"} # ---------- datawizard ----------- fish_encounters |> data_to_wide( names_from = "station", values_from = "seen" ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- fish_encounters |> pivot_wider( names_from = station, values_from = seen ) ``` ::: :::: ```{r pivot3, eval = evaluate_chunk, echo = FALSE} ``` ## Joining {#joining} In `{datawizard}`, joining datasets is done with `data_join()` (or its alias `data_merge()`). Contrary to `{dplyr}`, this unique function takes care of all types of join, which are then specified inside the function with the argument `join` (by default, `join = "left"`). Below, we show how to perform the four most common joins: full, left, right and inner. We will use the datasets `band_members`and `band_instruments` provided by `{dplyr}`: :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r eval = evaluate_chunk} band_members ``` ::: ::: {} ```{r eval = evaluate_chunk} band_instruments ``` ::: :::: ### Full join :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r join1, class.source = "datawizard"} # ---------- datawizard ----------- band_members |> data_join(band_instruments, join = "full") ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- band_members |> full_join(band_instruments) ``` ::: :::: ```{r join1, eval = evaluate_chunk, echo = FALSE} ``` ### Left and right joins :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r join2, class.source = "datawizard"} # ---------- datawizard ----------- band_members |> data_join(band_instruments, join = "left") ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- band_members |> left_join(band_instruments) ``` ::: :::: ```{r join2, eval = evaluate_chunk, echo = FALSE} ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r join3, class.source = "datawizard"} # ---------- datawizard ----------- band_members |> data_join(band_instruments, join = "right") ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- band_members |> right_join(band_instruments) ``` ::: :::: ```{r join3, eval = evaluate_chunk, echo = FALSE} ``` ### Inner join :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r join4, class.source = "datawizard"} # ---------- datawizard ----------- band_members |> data_join(band_instruments, join = "inner") ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- band_members |> inner_join(band_instruments) ``` ::: :::: ```{r join4, eval = evaluate_chunk, echo = FALSE} ``` ## Uniting {#uniting} Uniting variables is useful e.g to create unique indices by combining several variables or to gather years, months, and days into a single date. `data_unite()` offers an interface very close to `tidyr::unite()`: ```{r eval=evaluate_chunk} test <- data.frame( year = 2002:2004, month = c("02", "03", "09"), day = c("11", "22", "28"), stringsAsFactors = FALSE ) test ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r unite1, class.source = "datawizard"} # ---------- datawizard ----------- test |> data_unite( new_column = "date", select = c("year", "month", "day"), separator = "-" ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- test |> unite( col = "date", year, month, day, sep = "-" ) ``` ::: :::: ```{r unite1, eval = evaluate_chunk, echo = FALSE} ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r unite2, class.source = "datawizard"} # ---------- datawizard ----------- test |> data_unite( new_column = "date", select = c("year", "month", "day"), separator = "-", append = TRUE ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- test |> unite( col = "date", year, month, day, sep = "-", remove = FALSE ) ``` ::: :::: ```{r unite2, eval = evaluate_chunk, echo = FALSE} ``` ## Separating {#separating} Separating variables is the counterpart to uniting variables and is useful to split values into multiple columns, e.g. when splitting a date into values for years, months and days. `data_separate()` offers an interface very close to `tidyr::separate()`: ```{r eval=evaluate_chunk} test <- data.frame( date_arrival = c("2002-02-11", "2003-03-22", "2004-09-28"), date_departure = c("2002-03-15", "2003-03-28", "2004-09-30"), stringsAsFactors = FALSE ) test ``` :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r separate1, class.source = "datawizard"} # ---------- datawizard ----------- test |> data_separate( select = "date_arrival", new_columns = c("Year", "Month", "Day") ) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- test |> separate( date_arrival, into = c("Year", "Month", "Day") ) ``` ::: :::: ```{r separate1, eval = evaluate_chunk, echo = FALSE} ``` Unlike `tidyr::separate()`, you can separate multiple columns in one step with `data_separate()`. ```{r eval = evaluate_chunk} test |> data_separate( new_columns = list( date_arrival = c("Arr_Year", "Arr_Month", "Arr_Day"), date_departure = c("Dep_Year", "Dep_Month", "Dep_Day") ) ) ``` # Other useful functions `{datawizard}` contains other functions that are not necessarily included in `{dplyr}` or `{tidyr}` or do not directly modify the data. Some of them are inspired from the package `janitor`. ## Work with rownames We can convert a column in rownames and move rownames to a new column with `rownames_as_column()` and `column_as_rownames()`: ```{r eval = evaluate_chunk} mtcars <- head(mtcars) mtcars mtcars2 <- mtcars |> rownames_as_column(var = "model") mtcars2 mtcars2 |> column_as_rownames(var = "model") ``` ## Work with row ids `rowid_as_column()` is close but not identical to `tibble::rowid_to_column()`. The main difference is when we use it with grouped data. While `tibble::rowid_to_column()` uses one distinct rowid for every row in the dataset, `rowid_as_column()` creates one id for every row *in each group*. Therefore, two rows in different groups can have the same row id. This means that `rowid_as_column()` is closer to using `n()` in `mutate()`, like the following: ```{r eval=evaluate_chunk} test <- data.frame( group = c("A", "A", "B", "B"), value = c(3, 5, 8, 1), stringsAsFactors = FALSE ) test test |> data_group(group) |> tibble::rowid_to_column() test |> data_group(group) |> rowid_as_column() test |> data_group(group) |> mutate(id = seq_len(n())) ``` ## Work with column names When dealing with messy data, it is sometimes useful to use a row as column names, and vice versa. This can be done with `row_to_colnames()` and `colnames_to_row()`. ```{r eval = evaluate_chunk} x <- data.frame( X_1 = c(NA, "Title", 1:3), X_2 = c(NA, "Title2", 4:6) ) x x2 <- x |> row_to_colnames(row = 2) x2 x2 |> colnames_to_row() ``` ## Take a quick look at the data :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} ::: {} ```{r glimpse, class.source = "datawizard"} # ---------- datawizard ----------- data_peek(iris) ``` ::: ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- glimpse(iris) ``` ::: :::: ```{r glimpse, eval = evaluate_chunk, echo = FALSE} ```